src/amd/common/ac_llvm_build.c

   1 /*
   2  * Copyright 2014 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sub license, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  19  *
  20  * The above copyright notice and this permission notice (including the
  21  * next paragraph) shall be included in all copies or substantial portions
  22  * of the Software.
  23  *
  24  */
  25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
  26 #include "ac_llvm_build.h"
  27
  28 #include <llvm-c/Core.h>
  29
  30 #include "c11/threads.h"
  31
  32 #include <assert.h>
  33 #include <stdio.h>
  34
  35 #include "ac_llvm_util.h"
  36 #include "ac_exp_param.h"
  37 #include "util/bitscan.h"
  38 #include "util/macros.h"
  39 #include "util/u_atomic.h"
  40 #include "util/u_math.h"
  41 #include "sid.h"
  42
  43 #include "shader_enums.h"
  44
  45 #define AC_LLVM_INITIAL_CF_DEPTH 4
  46
/* Data for if/else/endif and bgnloop/endloop control flow structures.
 *
 * One instance describes a single open control-flow construct; only the
 * basic blocks that still need to be branched to are recorded here.
 */
struct ac_llvm_flow {
        /* Loop exit or next part of if/else/endif. */
        LLVMBasicBlockRef next_block;
        /* Block at the top of a loop. NOTE(review): presumably left unset for
         * if/else/endif entries -- confirm against the code that fills it in.
         */
        LLVMBasicBlockRef loop_entry_block;
};
  54
  55 /* Initialize module-independent parts of the context.
  56  *
  57  * The caller is responsible for initializing ctx::module and ctx::builder.
  58  */
  59 void
  60 ac_llvm_context_init(struct ac_llvm_context *ctx,
  61                      enum chip_class chip_class, enum radeon_family family)
  62 {
  63         LLVMValueRef args[1];
  64
  65         ctx->context = LLVMContextCreate();
  66
  67         ctx->chip_class = chip_class;
  68         ctx->family = family;
  69         ctx->module = NULL;
  70         ctx->builder = NULL;
  71
  72         ctx->voidt = LLVMVoidTypeInContext(ctx->context);
  73         ctx->i1 = LLVMInt1TypeInContext(ctx->context);
  74         ctx->i8 = LLVMInt8TypeInContext(ctx->context);
  75         ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
  76         ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
  77         ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
  78         ctx->intptr = ctx->i32;
  79         ctx->f16 = LLVMHalfTypeInContext(ctx->context);
  80         ctx->f32 = LLVMFloatTypeInContext(ctx->context);
  81         ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
  82         ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
  83         ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
  84         ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
  85         ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
  86         ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
  87         ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
  88         ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
  89
  90         ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
  91         ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
  92         ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
  93         ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
  94         ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
  95         ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
  96         ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
  97         ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
  98         ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
  99         ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
 100         ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
 101         ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
 102         ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
 103         ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
 104
 105         ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
 106         ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
 107
 108         ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 109                                                      "range", 5);
 110
 111         ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 112                                                                "invariant.load", 14);
 113
 114         ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
 115
 116         args[0] = LLVMConstReal(ctx->f32, 2.5);
 117         ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
 118
 119         ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 120                                                         "amdgpu.uniform", 14);
 121
 122         ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
 123 }
 124
 125 void
 126 ac_llvm_context_dispose(struct ac_llvm_context *ctx)
 127 {
 128         free(ctx->flow);
 129         ctx->flow = NULL;
 130         ctx->flow_depth_max = 0;
 131 }
 132
 133 int
 134 ac_get_llvm_num_components(LLVMValueRef value)
 135 {
 136         LLVMTypeRef type = LLVMTypeOf(value);
 137         unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
 138                                       ? LLVMGetVectorSize(type)
 139                                       : 1;
 140         return num_components;
 141 }
 142
 143 LLVMValueRef
 144 ac_llvm_extract_elem(struct ac_llvm_context *ac,
 145                      LLVMValueRef value,
 146                      int index)
 147 {
 148         if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
 149                 assert(index == 0);
 150                 return value;
 151         }
 152
 153         return LLVMBuildExtractElement(ac->builder, value,
 154                                        LLVMConstInt(ac->i32, index, false), "");
 155 }
 156
 157 int
 158 ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
 159 {
 160         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
 161                 type = LLVMGetElementType(type);
 162
 163         if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
 164                 return LLVMGetIntTypeWidth(type);
 165
 166         if (type == ctx->f16)
 167                 return 16;
 168         if (type == ctx->f32)
 169                 return 32;
 170         if (type == ctx->f64)
 171                 return 64;
 172
 173         unreachable("Unhandled type kind in get_elem_bits");
 174 }
 175
 176 unsigned
 177 ac_get_type_size(LLVMTypeRef type)
 178 {
 179         LLVMTypeKind kind = LLVMGetTypeKind(type);
 180
 181         switch (kind) {
 182         case LLVMIntegerTypeKind:
 183                 return LLVMGetIntTypeWidth(type) / 8;
 184         case LLVMHalfTypeKind:
 185                 return 2;
 186         case LLVMFloatTypeKind:
 187                 return 4;
 188         case LLVMDoubleTypeKind:
 189                 return 8;
 190         case LLVMPointerTypeKind:
 191                 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
 192                         return 4;
 193                 return 8;
 194         case LLVMVectorTypeKind:
 195                 return LLVMGetVectorSize(type) *
 196                        ac_get_type_size(LLVMGetElementType(type));
 197         case LLVMArrayTypeKind:
 198                 return LLVMGetArrayLength(type) *
 199                        ac_get_type_size(LLVMGetElementType(type));
 200         default:
 201                 assert(0);
 202                 return 0;
 203         }
 204 }
 205
 206 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 207 {
 208         if (t == ctx->i8)
 209                 return ctx->i8;
 210         else if (t == ctx->f16 || t == ctx->i16)
 211                 return ctx->i16;
 212         else if (t == ctx->f32 || t == ctx->i32)
 213                 return ctx->i32;
 214         else if (t == ctx->f64 || t == ctx->i64)
 215                 return ctx->i64;
 216         else
 217                 unreachable("Unhandled integer size");
 218 }
 219
 220 LLVMTypeRef
 221 ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 222 {
 223         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 224                 LLVMTypeRef elem_type = LLVMGetElementType(t);
 225                 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
 226                                       LLVMGetVectorSize(t));
 227         }
 228         if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
 229                 switch (LLVMGetPointerAddressSpace(t)) {
 230                 case AC_ADDR_SPACE_GLOBAL:
 231                         return ctx->i64;
 232                 case AC_ADDR_SPACE_LDS:
 233                         return ctx->i32;
 234                 default:
 235                         unreachable("unhandled address space");
 236                 }
 237         }
 238         return to_integer_type_scalar(ctx, t);
 239 }
 240
 241 LLVMValueRef
 242 ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
 243 {
 244         LLVMTypeRef type = LLVMTypeOf(v);
 245         if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
 246                 return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 247         }
 248         return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 249 }
 250
 251 LLVMValueRef
 252 ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
 253 {
 254         LLVMTypeRef type = LLVMTypeOf(v);
 255         if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
 256                 return v;
 257         return ac_to_integer(ctx, v);
 258 }
 259
 260 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 261 {
 262         if (t == ctx->i8)
 263                 return ctx->i8;
 264         else if (t == ctx->i16 || t == ctx->f16)
 265                 return ctx->f16;
 266         else if (t == ctx->i32 || t == ctx->f32)
 267                 return ctx->f32;
 268         else if (t == ctx->i64 || t == ctx->f64)
 269                 return ctx->f64;
 270         else
 271                 unreachable("Unhandled float size");
 272 }
 273
 274 LLVMTypeRef
 275 ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 276 {
 277         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 278                 LLVMTypeRef elem_type = LLVMGetElementType(t);
 279                 return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
 280                                       LLVMGetVectorSize(t));
 281         }
 282         return to_float_type_scalar(ctx, t);
 283 }
 284
 285 LLVMValueRef
 286 ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
 287 {
 288         LLVMTypeRef type = LLVMTypeOf(v);
 289         return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
 290 }
 291
 292
 293 LLVMValueRef
 294 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
 295                    LLVMTypeRef return_type, LLVMValueRef *params,
 296                    unsigned param_count, unsigned attrib_mask)
 297 {
 298         LLVMValueRef function, call;
 299         bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
 300
 301         function = LLVMGetNamedFunction(ctx->module, name);
 302         if (!function) {
 303                 LLVMTypeRef param_types[32], function_type;
 304                 unsigned i;
 305
 306                 assert(param_count <= 32);
 307
 308                 for (i = 0; i < param_count; ++i) {
 309                         assert(params[i]);
 310                         param_types[i] = LLVMTypeOf(params[i]);
 311                 }
 312                 function_type =
 313                     LLVMFunctionType(return_type, param_types, param_count, 0);
 314                 function = LLVMAddFunction(ctx->module, name, function_type);
 315
 316                 LLVMSetFunctionCallConv(function, LLVMCCallConv);
 317                 LLVMSetLinkage(function, LLVMExternalLinkage);
 318
 319                 if (!set_callsite_attrs)
 320                         ac_add_func_attributes(ctx->context, function, attrib_mask);
 321         }
 322
 323         call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
 324         if (set_callsite_attrs)
 325                 ac_add_func_attributes(ctx->context, call, attrib_mask);
 326         return call;
 327 }
 328
 329 /**
 330  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 331  * intrinsic names).
 332  */
 333 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
 334 {
 335         LLVMTypeRef elem_type = type;
 336
 337         assert(bufsize >= 8);
 338
 339         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
 340                 int ret = snprintf(buf, bufsize, "v%u",
 341                                         LLVMGetVectorSize(type));
 342                 if (ret < 0) {
 343                         char *type_name = LLVMPrintTypeToString(type);
 344                         fprintf(stderr, "Error building type name for: %s\n",
 345                                 type_name);
 346                         return;
 347                 }
 348                 elem_type = LLVMGetElementType(type);
 349                 buf += ret;
 350                 bufsize -= ret;
 351         }
 352         switch (LLVMGetTypeKind(elem_type)) {
 353         default: break;
 354         case LLVMIntegerTypeKind:
 355                 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
 356                 break;
 357         case LLVMHalfTypeKind:
 358                 snprintf(buf, bufsize, "f16");
 359                 break;
 360         case LLVMFloatTypeKind:
 361                 snprintf(buf, bufsize, "f32");
 362                 break;
 363         case LLVMDoubleTypeKind:
 364                 snprintf(buf, bufsize, "f64");
 365                 break;
 366         }
 367 }
 368
 369 /**
 370  * Helper function that builds an LLVM IR PHI node and immediately adds
 371  * incoming edges.
 372  */
 373 LLVMValueRef
 374 ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
 375              unsigned count_incoming, LLVMValueRef *values,
 376              LLVMBasicBlockRef *blocks)
 377 {
 378         LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
 379         LLVMAddIncoming(phi, values, blocks, count_incoming);
 380         return phi;
 381 }
 382
 383 void ac_build_s_barrier(struct ac_llvm_context *ctx)
 384 {
 385         ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
 386                            0, AC_FUNC_ATTR_CONVERGENT);
 387 }
 388
 389 /* Prevent optimizations (at least of memory accesses) across the current
 390  * point in the program by emitting empty inline assembly that is marked as
 391  * having side effects.
 392  *
 393  * Optionally, a value can be passed through the inline assembly to prevent
 394  * LLVM from hoisting calls to ReadNone functions.
 395  */
 396 void
 397 ac_build_optimization_barrier(struct ac_llvm_context *ctx,
 398                               LLVMValueRef *pvgpr)
 399 {
 400         static int counter = 0;
 401
 402         LLVMBuilderRef builder = ctx->builder;
 403         char code[16];
 404
 405         snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
 406
 407         if (!pvgpr) {
 408                 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
 409                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
 410                 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
 411         } else {
 412                 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
 413                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
 414                 LLVMValueRef vgpr = *pvgpr;
 415                 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
 416                 unsigned vgpr_size = ac_get_type_size(vgpr_type);
 417                 LLVMValueRef vgpr0;
 418
 419                 assert(vgpr_size % 4 == 0);
 420
 421                 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
 422                 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
 423                 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
 424                 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
 425                 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
 426
 427                 *pvgpr = vgpr;
 428         }
 429 }
 430
 431 LLVMValueRef
 432 ac_build_shader_clock(struct ac_llvm_context *ctx)
 433 {
 434         LLVMValueRef tmp = ac_build_intrinsic(ctx, "llvm.readcyclecounter",
 435                                               ctx->i64, NULL, 0, 0);
 436         return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
 437 }
 438
 439 LLVMValueRef
 440 ac_build_ballot(struct ac_llvm_context *ctx,
 441                 LLVMValueRef value)
 442 {
 443         LLVMValueRef args[3] = {
 444                 value,
 445                 ctx->i32_0,
 446                 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
 447         };
 448
 449         /* We currently have no other way to prevent LLVM from lifting the icmp
 450          * calls to a dominating basic block.
 451          */
 452         ac_build_optimization_barrier(ctx, &args[0]);
 453
 454         args[0] = ac_to_integer(ctx, args[0]);
 455
 456         return ac_build_intrinsic(ctx,
 457                                   "llvm.amdgcn.icmp.i32",
 458                                   ctx->i64, args, 3,
 459                                   AC_FUNC_ATTR_NOUNWIND |
 460                                   AC_FUNC_ATTR_READNONE |
 461                                   AC_FUNC_ATTR_CONVERGENT);
 462 }
 463
 464 LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
 465                                  LLVMValueRef value)
 466 {
 467         LLVMValueRef args[3] = {
 468                 value,
 469                 ctx->i1false,
 470                 LLVMConstInt(ctx->i32, LLVMIntNE, 0),
 471         };
 472
 473         assert(HAVE_LLVM >= 0x0800);
 474         return ac_build_intrinsic(ctx, "llvm.amdgcn.icmp.i1", ctx->i64, args, 3,
 475                                   AC_FUNC_ATTR_NOUNWIND |
 476                                   AC_FUNC_ATTR_READNONE |
 477                                   AC_FUNC_ATTR_CONVERGENT);
 478 }
 479
 480 LLVMValueRef
 481 ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
 482 {
 483         LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
 484         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 485         return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
 486 }
 487
 488 LLVMValueRef
 489 ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
 490 {
 491         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 492         return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
 493                              LLVMConstInt(ctx->i64, 0, 0), "");
 494 }
 495
 496 LLVMValueRef
 497 ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
 498 {
 499         LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
 500         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 501
 502         LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 503                                          vote_set, active_set, "");
 504         LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 505                                           vote_set,
 506                                           LLVMConstInt(ctx->i64, 0, 0), "");
 507         return LLVMBuildOr(ctx->builder, all, none, "");
 508 }
 509
 510 LLVMValueRef
 511 ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
 512                                unsigned value_count, unsigned component)
 513 {
 514         LLVMValueRef vec = NULL;
 515
 516         if (value_count == 1) {
 517                 return values[component];
 518         } else if (!value_count)
 519                 unreachable("value_count is 0");
 520
 521         for (unsigned i = component; i < value_count + component; i++) {
 522                 LLVMValueRef value = values[i];
 523
 524                 if (i == component)
 525                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 526                 LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
 527                 vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
 528         }
 529         return vec;
 530 }
 531
 532 LLVMValueRef
 533 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
 534                                 LLVMValueRef *values,
 535                                 unsigned value_count,
 536                                 unsigned value_stride,
 537                                 bool load,
 538                                 bool always_vector)
 539 {
 540         LLVMBuilderRef builder = ctx->builder;
 541         LLVMValueRef vec = NULL;
 542         unsigned i;
 543
 544         if (value_count == 1 && !always_vector) {
 545                 if (load)
 546                         return LLVMBuildLoad(builder, values[0], "");
 547                 return values[0];
 548         } else if (!value_count)
 549                 unreachable("value_count is 0");
 550
 551         for (i = 0; i < value_count; i++) {
 552                 LLVMValueRef value = values[i * value_stride];
 553                 if (load)
 554                         value = LLVMBuildLoad(builder, value, "");
 555
 556                 if (!i)
 557                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 558                 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
 559                 vec = LLVMBuildInsertElement(builder, vec, value, index, "");
 560         }
 561         return vec;
 562 }
 563
 564 LLVMValueRef
 565 ac_build_gather_values(struct ac_llvm_context *ctx,
 566                        LLVMValueRef *values,
 567                        unsigned value_count)
 568 {
 569         return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
 570 }
 571
 572 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 573  * channels with undef. Extract at most src_channels components from the input.
 574  */
 575 static LLVMValueRef
 576 ac_build_expand(struct ac_llvm_context *ctx,
 577                 LLVMValueRef value,
 578                 unsigned src_channels,
 579                 unsigned dst_channels)
 580 {
 581         LLVMTypeRef elemtype;
 582         LLVMValueRef chan[dst_channels];
 583
 584         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
 585                 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
 586
 587                 if (src_channels == dst_channels && vec_size == dst_channels)
 588                         return value;
 589
 590                 src_channels = MIN2(src_channels, vec_size);
 591
 592                 for (unsigned i = 0; i < src_channels; i++)
 593                         chan[i] = ac_llvm_extract_elem(ctx, value, i);
 594
 595                 elemtype = LLVMGetElementType(LLVMTypeOf(value));
 596         } else {
 597                 if (src_channels) {
 598                         assert(src_channels == 1);
 599                         chan[0] = value;
 600                 }
 601                 elemtype = LLVMTypeOf(value);
 602         }
 603
 604         for (unsigned i = src_channels; i < dst_channels; i++)
 605                 chan[i] = LLVMGetUndef(elemtype);
 606
 607         return ac_build_gather_values(ctx, chan, dst_channels);
 608 }
 609
 610 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
 611  * with undef. Extract at most num_channels components from the input.
 612  */
 613 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
 614                                      LLVMValueRef value,
 615                                      unsigned num_channels)
 616 {
 617         return ac_build_expand(ctx, value, num_channels, 4);
 618 }
 619
 620 LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
 621 {
 622         unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
 623         const char *name;
 624
 625         if (type_size == 2)
 626                 name = "llvm.rint.f16";
 627         else if (type_size == 4)
 628                 name = "llvm.rint.f32";
 629         else
 630                 name = "llvm.rint.f64";
 631
 632         return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
 633                                   AC_FUNC_ATTR_READNONE);
 634 }
 635
 636 LLVMValueRef
 637 ac_build_fdiv(struct ac_llvm_context *ctx,
 638               LLVMValueRef num,
 639               LLVMValueRef den)
 640 {
 641         /* If we do (num / den), LLVM >= 7.0 does:
 642          *    return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
 643          *
 644          * If we do (num * (1 / den)), LLVM does:
 645          *    return num * v_rcp_f32(den);
 646          */
 647         LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
 648         LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
 649         LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
 650
 651         /* Use v_rcp_f32 instead of precise division. */
 652         if (!LLVMIsConstant(ret))
 653                 LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
 654         return ret;
 655 }
 656
 657 /* See fast_idiv_by_const.h. */
 658 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
 659 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
 660                                 LLVMValueRef num,
 661                                 LLVMValueRef multiplier,
 662                                 LLVMValueRef pre_shift,
 663                                 LLVMValueRef post_shift,
 664                                 LLVMValueRef increment)
 665 {
 666         LLVMBuilderRef builder = ctx->builder;
 667
 668         num = LLVMBuildLShr(builder, num, pre_shift, "");
 669         num = LLVMBuildMul(builder,
 670                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 671                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 672         num = LLVMBuildAdd(builder, num,
 673                            LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
 674         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 675         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 676         return LLVMBuildLShr(builder, num, post_shift, "");
 677 }
 678
 679 /* See fast_idiv_by_const.h. */
 680 /* If num != UINT_MAX, this more efficient version can be used. */
 681 /* Set: increment = util_fast_udiv_info::increment; */
 682 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
 683                                     LLVMValueRef num,
 684                                     LLVMValueRef multiplier,
 685                                     LLVMValueRef pre_shift,
 686                                     LLVMValueRef post_shift,
 687                                     LLVMValueRef increment)
 688 {
 689         LLVMBuilderRef builder = ctx->builder;
 690
 691         num = LLVMBuildLShr(builder, num, pre_shift, "");
 692         num = LLVMBuildNUWAdd(builder, num, increment, "");
 693         num = LLVMBuildMul(builder,
 694                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 695                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 696         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 697         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 698         return LLVMBuildLShr(builder, num, post_shift, "");
 699 }
 700
 701 /* See fast_idiv_by_const.h. */
 702 /* Both operands must fit in 31 bits and the divisor must not be 1. */
 703 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
 704                                               LLVMValueRef num,
 705                                               LLVMValueRef multiplier,
 706                                               LLVMValueRef post_shift)
 707 {
 708         LLVMBuilderRef builder = ctx->builder;
 709
 710         num = LLVMBuildMul(builder,
 711                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 712                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 713         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 714         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 715         return LLVMBuildLShr(builder, num, post_shift, "");
 716 }
 717
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
        /* stc[0] holds sc and stc[1] holds tc. */
        LLVMValueRef stc[2];
        /* Major axis value, already multiplied by two. */
        LLVMValueRef ma;
        /* Cube face number; produced as an f32 value by build_cube_intrinsic. */
        LLVMValueRef id;
};
 727
 728 static void
 729 build_cube_intrinsic(struct ac_llvm_context *ctx,
 730                      LLVMValueRef in[3],
 731                      struct cube_selection_coords *out)
 732 {
 733         LLVMTypeRef f32 = ctx->f32;
 734
 735         out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
 736                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 737         out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
 738                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 739         out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
 740                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 741         out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
 742                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 743 }
 744
 745 /**
 746  * Build a manual selection sequence for cube face sc/tc coordinates and
 747  * major axis vector (multiplied by 2 for consistency) for the given
 748  * vec3 \p coords, for the face implied by \p selcoords.
 749  *
 750  * For the major axis, we always adjust the sign to be in the direction of
 751  * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 752  * the selcoords major axis.
 753  */
 754 static void build_cube_select(struct ac_llvm_context *ctx,
 755                               const struct cube_selection_coords *selcoords,
 756                               const LLVMValueRef *coords,
 757                               LLVMValueRef *out_st,
 758                               LLVMValueRef *out_ma)
 759 {
 760         LLVMBuilderRef builder = ctx->builder;
 761         LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
 762         LLVMValueRef is_ma_positive;
 763         LLVMValueRef sgn_ma;
 764         LLVMValueRef is_ma_z, is_not_ma_z;
 765         LLVMValueRef is_ma_y;
 766         LLVMValueRef is_ma_x;
 767         LLVMValueRef sgn;
 768         LLVMValueRef tmp;
 769
 770         is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
 771                 selcoords->ma, LLVMConstReal(f32, 0.0), "");
 772         sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
 773                 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
 774
 775         is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
 776         is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
 777         is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
 778                 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
 779         is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
 780
 781         /* Select sc */
 782         tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
 783         sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
 784                 LLVMBuildSelect(builder, is_ma_z, sgn_ma,
 785                         LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
 786         out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
 787
 788         /* Select tc */
 789         tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
 790         sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
 791                 LLVMConstReal(f32, -1.0), "");
 792         out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
 793
 794         /* Select ma */
 795         tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
 796                 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
 797         tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 798                                  ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
 799         *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
 800 }
 801
 802 void
 803 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
 804                        bool is_deriv, bool is_array, bool is_lod,
 805                        LLVMValueRef *coords_arg,
 806                        LLVMValueRef *derivs_arg)
 807 {
 808
 809         LLVMBuilderRef builder = ctx->builder;
 810         struct cube_selection_coords selcoords;
 811         LLVMValueRef coords[3];
 812         LLVMValueRef invma;
 813
 814         if (is_array && !is_lod) {
 815                 LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
 816
 817                 /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
 818                  *
 819                  *    "For Array forms, the array layer used will be
 820                  *
 821                  *       max(0, min(d−1, floor(layer+0.5)))
 822                  *
 823                  *     where d is the depth of the texture array and layer
 824                  *     comes from the component indicated in the tables below.
 825                  *     Workaroudn for an issue where the layer is taken from a
 826                  *     helper invocation which happens to fall on a different
 827                  *     layer due to extrapolation."
 828                  *
 829                  * GFX8 and earlier attempt to implement this in hardware by
 830                  * clamping the value of coords[2] = (8 * layer) + face.
 831                  * Unfortunately, this means that the we end up with the wrong
 832                  * face when clamping occurs.
 833                  *
 834                  * Clamp the layer earlier to work around the issue.
 835                  */
 836                 if (ctx->chip_class <= GFX8) {
 837                         LLVMValueRef ge0;
 838                         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
 839                         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
 840                 }
 841
 842                 coords_arg[3] = tmp;
 843         }
 844
 845         build_cube_intrinsic(ctx, coords_arg, &selcoords);
 846
 847         invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 848                         ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
 849         invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
 850
 851         for (int i = 0; i < 2; ++i)
 852                 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
 853
 854         coords[2] = selcoords.id;
 855
 856         if (is_deriv && derivs_arg) {
 857                 LLVMValueRef derivs[4];
 858                 int axis;
 859
 860                 /* Convert cube derivatives to 2D derivatives. */
 861                 for (axis = 0; axis < 2; axis++) {
 862                         LLVMValueRef deriv_st[2];
 863                         LLVMValueRef deriv_ma;
 864
 865                         /* Transform the derivative alongside the texture
 866                          * coordinate. Mathematically, the correct formula is
 867                          * as follows. Assume we're projecting onto the +Z face
 868                          * and denote by dx/dh the derivative of the (original)
 869                          * X texture coordinate with respect to horizontal
 870                          * window coordinates. The projection onto the +Z face
 871                          * plane is:
 872                          *
 873                          *   f(x,z) = x/z
 874                          *
 875                          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
 876                          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
 877                          *
 878                          * This motivatives the implementation below.
 879                          *
 880                          * Whether this actually gives the expected results for
 881                          * apps that might feed in derivatives obtained via
 882                          * finite differences is anyone's guess. The OpenGL spec
 883                          * seems awfully quiet about how textureGrad for cube
 884                          * maps should be handled.
 885                          */
 886                         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
 887                                           deriv_st, &deriv_ma);
 888
 889                         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
 890
 891                         for (int i = 0; i < 2; ++i)
 892                                 derivs[axis * 2 + i] =
 893                                         LLVMBuildFSub(builder,
 894                                                 LLVMBuildFMul(builder, deriv_st[i], invma, ""),
 895                                                 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
 896                 }
 897
 898                 memcpy(derivs_arg, derivs, sizeof(derivs));
 899         }
 900
 901         /* Shift the texture coordinate. This must be applied after the
 902          * derivative calculation.
 903          */
 904         for (int i = 0; i < 2; ++i)
 905                 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
 906
 907         if (is_array) {
 908                 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
 909                 /* coords_arg.w component - array_index for cube arrays */
 910                 coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
 911         }
 912
 913         memcpy(coords_arg, coords, sizeof(coords));
 914 }
 915
 916
 917 LLVMValueRef
 918 ac_build_fs_interp(struct ac_llvm_context *ctx,
 919                    LLVMValueRef llvm_chan,
 920                    LLVMValueRef attr_number,
 921                    LLVMValueRef params,
 922                    LLVMValueRef i,
 923                    LLVMValueRef j)
 924 {
 925         LLVMValueRef args[5];
 926         LLVMValueRef p1;
 927
 928         args[0] = i;
 929         args[1] = llvm_chan;
 930         args[2] = attr_number;
 931         args[3] = params;
 932
 933         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
 934                                 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 935
 936         args[0] = p1;
 937         args[1] = j;
 938         args[2] = llvm_chan;
 939         args[3] = attr_number;
 940         args[4] = params;
 941
 942         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
 943                                   ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 944 }
 945
 946 LLVMValueRef
 947 ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
 948                        LLVMValueRef llvm_chan,
 949                        LLVMValueRef attr_number,
 950                        LLVMValueRef params,
 951                        LLVMValueRef i,
 952                        LLVMValueRef j)
 953 {
 954         LLVMValueRef args[6];
 955         LLVMValueRef p1;
 956
 957         args[0] = i;
 958         args[1] = llvm_chan;
 959         args[2] = attr_number;
 960         args[3] = ctx->i1false;
 961         args[4] = params;
 962
 963         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
 964                                 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 965
 966         args[0] = p1;
 967         args[1] = j;
 968         args[2] = llvm_chan;
 969         args[3] = attr_number;
 970         args[4] = ctx->i1false;
 971         args[5] = params;
 972
 973         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
 974                                   ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
 975 }
 976
 977 LLVMValueRef
 978 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
 979                        LLVMValueRef parameter,
 980                        LLVMValueRef llvm_chan,
 981                        LLVMValueRef attr_number,
 982                        LLVMValueRef params)
 983 {
 984         LLVMValueRef args[4];
 985
 986         args[0] = parameter;
 987         args[1] = llvm_chan;
 988         args[2] = attr_number;
 989         args[3] = params;
 990
 991         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
 992                                   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 993 }
 994
 995 LLVMValueRef
 996 ac_build_gep_ptr(struct ac_llvm_context *ctx,
 997                  LLVMValueRef base_ptr,
 998                  LLVMValueRef index)
 999 {
1000         return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1001 }
1002
1003 LLVMValueRef
1004 ac_build_gep0(struct ac_llvm_context *ctx,
1005               LLVMValueRef base_ptr,
1006               LLVMValueRef index)
1007 {
1008         LLVMValueRef indices[2] = {
1009                 ctx->i32_0,
1010                 index,
1011         };
1012         return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
1013 }
1014
1015 LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
1016                                   LLVMValueRef index)
1017 {
1018         return LLVMBuildPointerCast(ctx->builder,
1019                                     LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
1020                                     LLVMTypeOf(ptr), "");
1021 }
1022
1023 void
1024 ac_build_indexed_store(struct ac_llvm_context *ctx,
1025                        LLVMValueRef base_ptr, LLVMValueRef index,
1026                        LLVMValueRef value)
1027 {
1028         LLVMBuildStore(ctx->builder, value,
1029                        ac_build_gep0(ctx, base_ptr, index));
1030 }
1031
1032 /**
1033  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1034  * It's equivalent to doing a load from &base_ptr[index].
1035  *
1036  * \param base_ptr  Where the array starts.
1037  * \param index     The element index into the array.
1038  * \param uniform   Whether the base_ptr and index can be assumed to be
1039  *                  dynamically uniform (i.e. load to an SGPR)
1040  * \param invariant Whether the load is invariant (no other opcodes affect it)
1041  * \param no_unsigned_wraparound
1042  *    For all possible re-associations and re-distributions of an expression
1043  *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1044  *    without inbounds in base_ptr), this parameter is true if "addr + offset"
1045  *    does not result in an unsigned integer wraparound. This is used for
1046  *    optimal code generation of 32-bit pointer arithmetic.
1047  *
1048  *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
1049  *    integer wraparound can't be an imm offset in s_load_dword, because
1050  *    the instruction performs "addr + offset" in 64 bits.
1051  *
1052  *    Expected usage for bindless textures by chaining GEPs:
1053  *      // possible unsigned wraparound, don't use InBounds:
1054  *      ptr1 = LLVMBuildGEP(base_ptr, index);
1055  *      image = load(ptr1); // becomes "s_load ptr1, 0"
1056  *
1057  *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1058  *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1059  */
1060 static LLVMValueRef
1061 ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1062                      LLVMValueRef index, bool uniform, bool invariant,
1063                      bool no_unsigned_wraparound)
1064 {
1065         LLVMValueRef pointer, result;
1066
1067         if (no_unsigned_wraparound &&
1068             LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1069                 pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
1070         else
1071                 pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1072
1073         if (uniform)
1074                 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1075         result = LLVMBuildLoad(ctx->builder, pointer, "");
1076         if (invariant)
1077                 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1078         return result;
1079 }
1080
1081 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1082                            LLVMValueRef index)
1083 {
1084         return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
1085 }
1086
1087 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
1088                                      LLVMValueRef base_ptr, LLVMValueRef index)
1089 {
1090         return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
1091 }
1092
1093 /* This assumes that there is no unsigned integer wraparound during the address
1094  * computation, excluding all GEPs within base_ptr. */
1095 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
1096                                    LLVMValueRef base_ptr, LLVMValueRef index)
1097 {
1098         return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1099 }
1100
1101 /* See ac_build_load_custom() documentation. */
1102 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1103                                    LLVMValueRef base_ptr, LLVMValueRef index)
1104 {
1105         return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1106 }
1107
1108 static void
1109 ac_build_buffer_store_common(struct ac_llvm_context *ctx,
1110                              LLVMValueRef rsrc,
1111                              LLVMValueRef data,
1112                              LLVMValueRef vindex,
1113                              LLVMValueRef voffset,
1114                              unsigned num_channels,
1115                              bool glc,
1116                              bool slc,
1117                              bool writeonly_memory,
1118                              bool use_format)
1119 {
1120         LLVMValueRef args[] = {
1121                 data,
1122                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1123                 vindex ? vindex : ctx->i32_0,
1124                 voffset,
1125                 LLVMConstInt(ctx->i1, glc, 0),
1126                 LLVMConstInt(ctx->i1, slc, 0)
1127         };
1128         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1129
1130         const char *type_names[] = {"f32", "v2f32", "v4f32"};
1131         char name[256];
1132
1133         if (use_format) {
1134                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s",
1135                          type_names[func]);
1136         } else {
1137                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
1138                          type_names[func]);
1139         }
1140
1141         ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
1142                            ac_get_store_intr_attribs(writeonly_memory));
1143 }
1144
1145 static void
1146 ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
1147                                    LLVMValueRef rsrc,
1148                                    LLVMValueRef data,
1149                                    LLVMValueRef vindex,
1150                                    LLVMValueRef voffset,
1151                                    LLVMValueRef soffset,
1152                                    unsigned num_channels,
1153                                    LLVMTypeRef return_channel_type,
1154                                    bool glc,
1155                                    bool slc,
1156                                    bool writeonly_memory,
1157                                    bool use_format,
1158                                    bool structurized)
1159 {
1160         LLVMValueRef args[6];
1161         int idx = 0;
1162         args[idx++] = data;
1163         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1164         if (structurized)
1165                 args[idx++] = vindex ? vindex : ctx->i32_0;
1166         args[idx++] = voffset ? voffset : ctx->i32_0;
1167         args[idx++] = soffset ? soffset : ctx->i32_0;
1168         args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
1169         unsigned func = num_channels == 3 ? 4 : num_channels;
1170         const char *indexing_kind = structurized ? "struct" : "raw";
1171         char name[256], type_name[8];
1172
1173         LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type;
1174         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1175
1176         if (use_format) {
1177                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
1178                          indexing_kind, type_name);
1179         } else {
1180                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s",
1181                          indexing_kind, type_name);
1182         }
1183
1184         ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
1185                            ac_get_store_intr_attribs(writeonly_memory));
1186 }
1187
1188 void
1189 ac_build_buffer_store_format(struct ac_llvm_context *ctx,
1190                              LLVMValueRef rsrc,
1191                              LLVMValueRef data,
1192                              LLVMValueRef vindex,
1193                              LLVMValueRef voffset,
1194                              unsigned num_channels,
1195                              bool glc,
1196                              bool writeonly_memory)
1197 {
1198         if (HAVE_LLVM >= 0x800) {
1199                 ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
1200                                                    voffset, NULL, num_channels,
1201                                                    ctx->f32, glc, false,
1202                                                    writeonly_memory, true, true);
1203         } else {
1204                 ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset,
1205                                              num_channels, glc, false,
1206                                              writeonly_memory, true);
1207         }
1208 }
1209
1210 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1211  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1212  * or v4i32 (num_channels=3,4).
1213  */
1214 void
1215 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
1216                             LLVMValueRef rsrc,
1217                             LLVMValueRef vdata,
1218                             unsigned num_channels,
1219                             LLVMValueRef voffset,
1220                             LLVMValueRef soffset,
1221                             unsigned inst_offset,
1222                             bool glc,
1223                             bool slc,
1224                             bool writeonly_memory,
1225                             bool swizzle_enable_hint)
1226 {
1227         /* Split 3 channel stores, becase LLVM doesn't support 3-channel
1228          * intrinsics. */
1229         if (num_channels == 3) {
1230                 LLVMValueRef v[3], v01;
1231
1232                 for (int i = 0; i < 3; i++) {
1233                         v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
1234                                         LLVMConstInt(ctx->i32, i, 0), "");
1235                 }
1236                 v01 = ac_build_gather_values(ctx, v, 2);
1237
1238                 ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
1239                                             soffset, inst_offset, glc, slc,
1240                                             writeonly_memory, swizzle_enable_hint);
1241                 ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
1242                                             soffset, inst_offset + 8,
1243                                             glc, slc,
1244                                             writeonly_memory, swizzle_enable_hint);
1245                 return;
1246         }
1247
1248         /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
1249          * (voffset is swizzled, but soffset isn't swizzled).
1250          * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
1251          */
1252         if (!swizzle_enable_hint) {
1253                 LLVMValueRef offset = soffset;
1254
1255                 if (inst_offset)
1256                         offset = LLVMBuildAdd(ctx->builder, offset,
1257                                               LLVMConstInt(ctx->i32, inst_offset, 0), "");
1258
1259                 if (HAVE_LLVM >= 0x800) {
1260                         ac_build_llvm8_buffer_store_common(ctx, rsrc,
1261                                                            ac_to_float(ctx, vdata),
1262                                                            ctx->i32_0,
1263                                                            voffset, offset,
1264                                                            num_channels,
1265                                                            ctx->f32,
1266                                                            glc, slc,
1267                                                            writeonly_memory,
1268                                                            false, false);
1269                 } else {
1270                         if (voffset)
1271                                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1272
1273                         ac_build_buffer_store_common(ctx, rsrc,
1274                                                      ac_to_float(ctx, vdata),
1275                                                      ctx->i32_0, offset,
1276                                                      num_channels, glc, slc,
1277                                                      writeonly_memory, false);
1278                 }
1279                 return;
1280         }
1281
1282         static const unsigned dfmts[] = {
1283                 V_008F0C_BUF_DATA_FORMAT_32,
1284                 V_008F0C_BUF_DATA_FORMAT_32_32,
1285                 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1286                 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1287         };
1288         unsigned dfmt = dfmts[num_channels - 1];
1289         unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1290         LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
1291
1292         ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1293                                    immoffset, num_channels, dfmt, nfmt, glc,
1294                                    slc, writeonly_memory);
1295 }
1296
1297 static LLVMValueRef
1298 ac_build_buffer_load_common(struct ac_llvm_context *ctx,
1299                             LLVMValueRef rsrc,
1300                             LLVMValueRef vindex,
1301                             LLVMValueRef voffset,
1302                             unsigned num_channels,
1303                             bool glc,
1304                             bool slc,
1305                             bool can_speculate,
1306                             bool use_format)
1307 {
1308         LLVMValueRef args[] = {
1309                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1310                 vindex ? vindex : ctx->i32_0,
1311                 voffset,
1312                 LLVMConstInt(ctx->i1, glc, 0),
1313                 LLVMConstInt(ctx->i1, slc, 0)
1314         };
1315         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1316
1317         LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
1318         const char *type_names[] = {"f32", "v2f32", "v4f32"};
1319         char name[256];
1320
1321         if (use_format) {
1322                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s",
1323                          type_names[func]);
1324         } else {
1325                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
1326                          type_names[func]);
1327         }
1328
1329         return ac_build_intrinsic(ctx, name, types[func], args,
1330                                   ARRAY_SIZE(args),
1331                                   ac_get_load_intr_attribs(can_speculate));
1332 }
1333
1334 static LLVMValueRef
1335 ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
1336                                   LLVMValueRef rsrc,
1337                                   LLVMValueRef vindex,
1338                                   LLVMValueRef voffset,
1339                                   LLVMValueRef soffset,
1340                                   unsigned num_channels,
1341                                   LLVMTypeRef channel_type,
1342                                   bool glc,
1343                                   bool slc,
1344                                   bool can_speculate,
1345                                   bool use_format,
1346                                   bool structurized)
1347 {
1348         LLVMValueRef args[5];
1349         int idx = 0;
1350         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1351         if (structurized)
1352                 args[idx++] = vindex ? vindex : ctx->i32_0;
1353         args[idx++] = voffset ? voffset : ctx->i32_0;
1354         args[idx++] = soffset ? soffset : ctx->i32_0;
1355         args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
1356         unsigned func = num_channels == 3 ? 4 : num_channels;
1357         const char *indexing_kind = structurized ? "struct" : "raw";
1358         char name[256], type_name[8];
1359
1360         LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
1361         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1362
1363         if (use_format) {
1364                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
1365                          indexing_kind, type_name);
1366         } else {
1367                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
1368                          indexing_kind, type_name);
1369         }
1370
1371         return ac_build_intrinsic(ctx, name, type, args, idx,
1372                                   ac_get_load_intr_attribs(can_speculate));
1373 }
1374
1375 LLVMValueRef
1376 ac_build_buffer_load(struct ac_llvm_context *ctx,
1377                      LLVMValueRef rsrc,
1378                      int num_channels,
1379                      LLVMValueRef vindex,
1380                      LLVMValueRef voffset,
1381                      LLVMValueRef soffset,
1382                      unsigned inst_offset,
1383                      unsigned glc,
1384                      unsigned slc,
1385                      bool can_speculate,
1386                      bool allow_smem)
1387 {
1388         LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
1389         if (voffset)
1390                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1391         if (soffset)
1392                 offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
1393
1394         if (allow_smem && !slc &&
1395             (!glc || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= GFX8))) {
1396                 assert(vindex == NULL);
1397
1398                 LLVMValueRef result[8];
1399
1400                 for (int i = 0; i < num_channels; i++) {
1401                         if (i) {
1402                                 offset = LLVMBuildAdd(ctx->builder, offset,
1403                                                       LLVMConstInt(ctx->i32, 4, 0), "");
1404                         }
1405                         const char *intrname =
1406                                 HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
1407                                                     : "llvm.SI.load.const.v4i32";
1408                         unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
1409                         LLVMValueRef args[3] = {
1410                                 rsrc,
1411                                 offset,
1412                                 glc ? ctx->i32_1 : ctx->i32_0,
1413                         };
1414                         result[i] = ac_build_intrinsic(ctx, intrname,
1415                                                        ctx->f32, args, num_args,
1416                                                        AC_FUNC_ATTR_READNONE |
1417                                                        (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0));
1418                 }
1419                 if (num_channels == 1)
1420                         return result[0];
1421
1422                 if (num_channels == 3)
1423                         result[num_channels++] = LLVMGetUndef(ctx->f32);
1424                 return ac_build_gather_values(ctx, result, num_channels);
1425         }
1426
1427         if (HAVE_LLVM >= 0x0800) {
1428                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex,
1429                                                          offset, ctx->i32_0,
1430                                                          num_channels, ctx->f32,
1431                                                          glc, slc,
1432                                                          can_speculate, false,
1433                                                          false);
1434         }
1435
1436         return ac_build_buffer_load_common(ctx, rsrc, vindex, offset,
1437                                            num_channels, glc, slc,
1438                                            can_speculate, false);
1439 }
1440
1441 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
1442                                          LLVMValueRef rsrc,
1443                                          LLVMValueRef vindex,
1444                                          LLVMValueRef voffset,
1445                                          unsigned num_channels,
1446                                          bool glc,
1447                                          bool can_speculate)
1448 {
1449         if (HAVE_LLVM >= 0x800) {
1450                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1451                                                          num_channels, ctx->f32,
1452                                                          glc, false,
1453                                                          can_speculate, true, true);
1454         }
1455         return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
1456                                            num_channels, glc, false,
1457                                            can_speculate, true);
1458 }
1459
1460 LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
1461                                                   LLVMValueRef rsrc,
1462                                                   LLVMValueRef vindex,
1463                                                   LLVMValueRef voffset,
1464                                                   unsigned num_channels,
1465                                                   bool glc,
1466                                                   bool can_speculate)
1467 {
1468         if (HAVE_LLVM >= 0x800) {
1469                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1470                                                          num_channels, ctx->f32,
1471                                                          glc, false,
1472                                                          can_speculate, true, true);
1473         }
1474
1475         LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
1476         LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
1477         stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
1478
1479         LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
1480                                                       LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
1481                                                       elem_count, stride, "");
1482
1483         LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
1484                                                        LLVMConstInt(ctx->i32, 2, 0), "");
1485
1486         return ac_build_buffer_load_common(ctx, new_rsrc, vindex, voffset,
1487                                            num_channels, glc, false,
1488                                            can_speculate, true);
1489 }
1490
1491 static LLVMValueRef
1492 ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
1493                             LLVMValueRef rsrc,
1494                             LLVMValueRef vindex,
1495                             LLVMValueRef voffset,
1496                             LLVMValueRef soffset,
1497                             unsigned num_channels,
1498                             unsigned dfmt,
1499                             unsigned nfmt,
1500                             bool glc,
1501                             bool slc,
1502                             bool can_speculate,
1503                             bool structurized)
1504 {
1505         LLVMValueRef args[6];
1506         int idx = 0;
1507         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1508         if (structurized)
1509                 args[idx++] = vindex ? vindex : ctx->i32_0;
1510         args[idx++] = voffset ? voffset : ctx->i32_0;
1511         args[idx++] = soffset ? soffset : ctx->i32_0;
1512         args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
1513         args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
1514         unsigned func = num_channels == 3 ? 4 : num_channels;
1515         const char *indexing_kind = structurized ? "struct" : "raw";
1516         char name[256], type_name[8];
1517
1518         LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1519         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1520
1521         snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
1522                  indexing_kind, type_name);
1523
1524         return ac_build_intrinsic(ctx, name, type, args, idx,
1525                                   ac_get_load_intr_attribs(can_speculate));
1526 }
1527
1528 static LLVMValueRef
1529 ac_build_tbuffer_load(struct ac_llvm_context *ctx,
1530                             LLVMValueRef rsrc,
1531                             LLVMValueRef vindex,
1532                             LLVMValueRef voffset,
1533                             LLVMValueRef soffset,
1534                             LLVMValueRef immoffset,
1535                             unsigned num_channels,
1536                             unsigned dfmt,
1537                             unsigned nfmt,
1538                             bool glc,
1539                             bool slc,
1540                             bool can_speculate,
1541                             bool structurized) /* only matters for LLVM 8+ */
1542 {
1543         if (HAVE_LLVM >= 0x800) {
1544                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1545
1546                 return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset,
1547                                                    soffset, num_channels,
1548                                                    dfmt, nfmt, glc, slc,
1549                                                    can_speculate, structurized);
1550         }
1551
1552         LLVMValueRef args[] = {
1553                 rsrc,
1554                 vindex ? vindex : ctx->i32_0,
1555                 voffset,
1556                 soffset,
1557                 immoffset,
1558                 LLVMConstInt(ctx->i32, dfmt, false),
1559                 LLVMConstInt(ctx->i32, nfmt, false),
1560                 LLVMConstInt(ctx->i1, glc, false),
1561                 LLVMConstInt(ctx->i1, slc, false),
1562         };
1563         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1564         LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
1565         const char *type_names[] = {"i32", "v2i32", "v4i32"};
1566         char name[256];
1567
1568         snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s",
1569                  type_names[func]);
1570
1571         return ac_build_intrinsic(ctx, name, types[func], args, 9,
1572                                   ac_get_load_intr_attribs(can_speculate));
1573 }
1574
1575 LLVMValueRef
1576 ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx,
1577                              LLVMValueRef rsrc,
1578                              LLVMValueRef vindex,
1579                              LLVMValueRef voffset,
1580                              LLVMValueRef soffset,
1581                              LLVMValueRef immoffset,
1582                              unsigned num_channels,
1583                              unsigned dfmt,
1584                              unsigned nfmt,
1585                              bool glc,
1586                              bool slc,
1587                              bool can_speculate)
1588 {
1589         return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset,
1590                                      immoffset, num_channels, dfmt, nfmt, glc,
1591                                      slc, can_speculate, true);
1592 }
1593
1594 LLVMValueRef
1595 ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx,
1596                           LLVMValueRef rsrc,
1597                           LLVMValueRef voffset,
1598                           LLVMValueRef soffset,
1599                           LLVMValueRef immoffset,
1600                           unsigned num_channels,
1601                           unsigned dfmt,
1602                           unsigned nfmt,
1603                           bool glc,
1604                           bool slc,
1605                           bool can_speculate)
1606 {
1607         return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset,
1608                                      immoffset, num_channels, dfmt, nfmt, glc,
1609                                      slc, can_speculate, false);
1610 }
1611
1612 LLVMValueRef
1613 ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
1614                             LLVMValueRef rsrc,
1615                             LLVMValueRef voffset,
1616                             LLVMValueRef soffset,
1617                             LLVMValueRef immoffset,
1618                             bool glc)
1619 {
1620         LLVMValueRef res;
1621
1622         if (HAVE_LLVM >= 0x900) {
1623                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1624
1625                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1626                 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1627                                                         voffset, soffset,
1628                                                         1, ctx->i16, glc, false,
1629                                                         false, false, false);
1630         } else {
1631                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1632                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1633
1634                 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1635                                                 immoffset, 1, dfmt, nfmt, glc, false,
1636                                                 false);
1637
1638                 res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
1639         }
1640
1641         return res;
1642 }
1643
1644 LLVMValueRef
1645 ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx,
1646                            LLVMValueRef rsrc,
1647                            LLVMValueRef voffset,
1648                            LLVMValueRef soffset,
1649                            LLVMValueRef immoffset,
1650                            bool glc)
1651 {
1652         LLVMValueRef res;
1653
1654         if (HAVE_LLVM >= 0x900) {
1655                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1656
1657                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1658                 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1659                                                         voffset, soffset,
1660                                                         1, ctx->i8, glc, false,
1661                                                         false, false, false);
1662         } else {
1663                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1664                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1665
1666                 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1667                                                 immoffset, 1, dfmt, nfmt, glc, false,
1668                                                 false);
1669
1670                 res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
1671         }
1672
1673         return res;
1674 }
1675
1676 /**
1677  * Convert an 11- or 10-bit unsigned floating point number to an f32.
1678  *
1679  * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1680  * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1681  */
1682 static LLVMValueRef
1683 ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits, unsigned mant_bits)
1684 {
1685         assert(LLVMTypeOf(src) == ctx->i32);
1686
1687         LLVMValueRef tmp;
1688         LLVMValueRef mantissa;
1689         mantissa = LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1690
1691         /* Converting normal numbers is just a shift + correcting the exponent bias */
1692         unsigned normal_shift = 23 - mant_bits;
1693         unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1694         LLVMValueRef shifted, normal;
1695
1696         shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1697         normal = LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1698
1699         /* Converting nan/inf numbers is the same, but with a different exponent update */
1700         LLVMValueRef naninf;
1701         naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1702
1703         /* Converting denormals is the complex case: determine the leading zeros of the
1704          * mantissa to obtain the correct shift for the mantissa and exponent correction.
1705          */
1706         LLVMValueRef denormal;
1707         LLVMValueRef params[2] = {
1708                 mantissa,
1709                 ctx->i1true, /* result can be undef when arg is 0 */
1710         };
1711         LLVMValueRef ctlz = ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32,
1712                                               params, 2, AC_FUNC_ATTR_READNONE);
1713
1714         /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1715         tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1716         denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1717
1718         unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1719         tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1720         tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1721         denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1722
1723         /* Select the final result. */
1724         LLVMValueRef result;
1725
1726         tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1727                             LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
1728         result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1729
1730         tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1731                             LLVMConstInt(ctx->i32, 1 << mant_bits, false), "");
1732         result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1733
1734         tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1735         result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1736
1737         return ac_to_float(ctx, result);
1738 }
1739
1740 /**
1741  * Generate a fully general open coded buffer format fetch with all required
1742  * fixups suitable for vertex fetch, using non-format buffer loads.
1743  *
1744  * Some combinations of argument values have special interpretations:
1745  * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1746  * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1747  *
1748  * \param log_size log(size of channel in bytes)
1749  * \param num_channels number of channels (1 to 4)
1750  * \param format AC_FETCH_FORMAT_xxx value
1751  * \param reverse whether XYZ channels are reversed
1752  * \param known_aligned whether the source is known to be aligned to hardware's
1753  *                      effective element size for loading the given format
1754  *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1755  * \param rsrc buffer resource descriptor
1756  * \return the resulting vector of floats or integers bitcast to <4 x i32>
1757  */
1758 LLVMValueRef
1759 ac_build_opencoded_load_format(struct ac_llvm_context *ctx,
1760                                unsigned log_size,
1761                                unsigned num_channels,
1762                                unsigned format,
1763                                bool reverse,
1764                                bool known_aligned,
1765                                LLVMValueRef rsrc,
1766                                LLVMValueRef vindex,
1767                                LLVMValueRef voffset,
1768                                LLVMValueRef soffset,
1769                                bool glc,
1770                                bool slc,
1771                                bool can_speculate)
1772 {
1773         LLVMValueRef tmp;
1774         unsigned load_log_size = log_size;
1775         unsigned load_num_channels = num_channels;
1776         if (log_size == 3) {
1777                 load_log_size = 2;
1778                 if (format == AC_FETCH_FORMAT_FLOAT) {
1779                         load_num_channels = 2 * num_channels;
1780                 } else {
1781                         load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1782                 }
1783         }
1784
1785         int log_recombine = 0;
1786         if (ctx->chip_class == GFX6 && !known_aligned) {
1787                 /* Avoid alignment restrictions by loading one byte at a time. */
1788                 load_num_channels <<= load_log_size;
1789                 log_recombine = load_log_size;
1790                 load_log_size = 0;
1791         } else if (load_num_channels == 2 || load_num_channels == 4) {
1792                 log_recombine = -util_logbase2(load_num_channels);
1793                 load_num_channels = 1;
1794                 load_log_size += -log_recombine;
1795         }
1796
1797         assert(load_log_size >= 2 || HAVE_LLVM >= 0x0900);
1798
1799         LLVMValueRef loads[32]; /* up to 32 bytes */
1800         for (unsigned i = 0; i < load_num_channels; ++i) {
1801                 tmp = LLVMBuildAdd(ctx->builder, soffset,
1802                                    LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1803                 if (HAVE_LLVM >= 0x0800) {
1804                         LLVMTypeRef channel_type = load_log_size == 0 ? ctx->i8 :
1805                                                    load_log_size == 1 ? ctx->i16 : ctx->i32;
1806                         unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1807                         loads[i] = ac_build_llvm8_buffer_load_common(
1808                                         ctx, rsrc, vindex, voffset, tmp,
1809                                         num_channels, channel_type, glc, slc,
1810                                         can_speculate, false, true);
1811                 } else {
1812                         tmp = LLVMBuildAdd(ctx->builder, voffset, tmp, "");
1813                         loads[i] = ac_build_buffer_load_common(
1814                                         ctx, rsrc, vindex, tmp,
1815                                         1 << (load_log_size - 2), glc, slc, can_speculate, false);
1816                 }
1817                 if (load_log_size >= 2)
1818                         loads[i] = ac_to_integer(ctx, loads[i]);
1819         }
1820
1821         if (log_recombine > 0) {
1822                 /* Recombine bytes if necessary (GFX6 only) */
1823                 LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1824
1825                 for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1826                         LLVMValueRef accum = NULL;
1827                         for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1828                                 tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1829                                 if (i == 0) {
1830                                         accum = tmp;
1831                                 } else {
1832                                         tmp = LLVMBuildShl(ctx->builder, tmp,
1833                                                            LLVMConstInt(dst_type, 8 * i, false), "");
1834                                         accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1835                                 }
1836                         }
1837                         loads[dst] = accum;
1838                 }
1839         } else if (log_recombine < 0) {
1840                 /* Split vectors of dwords */
1841                 if (load_log_size > 2) {
1842                         assert(load_num_channels == 1);
1843                         LLVMValueRef loaded = loads[0];
1844                         unsigned log_split = load_log_size - 2;
1845                         log_recombine += log_split;
1846                         load_num_channels = 1 << log_split;
1847                         load_log_size = 2;
1848                         for (unsigned i = 0; i < load_num_channels; ++i) {
1849                                 tmp = LLVMConstInt(ctx->i32, i, false);
1850                                 loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1851                         }
1852                 }
1853
1854                 /* Further split dwords and shorts if required */
1855                 if (log_recombine < 0) {
1856                         for (unsigned src = load_num_channels,
1857                                       dst = load_num_channels << -log_recombine;
1858                              src > 0; --src) {
1859                                 unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1860                                 LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1861                                 LLVMValueRef loaded = loads[src - 1];
1862                                 LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1863                                 for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1864                                         tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1865                                         tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1866                                         loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1867                                 }
1868                         }
1869                 }
1870         }
1871
1872         if (log_size == 3) {
1873                 if (format == AC_FETCH_FORMAT_FLOAT) {
1874                         for (unsigned i = 0; i < num_channels; ++i) {
1875                                 tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1876                                 loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1877                         }
1878                 } else if (format == AC_FETCH_FORMAT_FIXED) {
1879                         /* 10_11_11_FLOAT */
1880                         LLVMValueRef data = loads[0];
1881                         LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1882                         LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1883                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1884                         LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1885                         LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1886
1887                         loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1888                         loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1889                         loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1890
1891                         num_channels = 3;
1892                         log_size = 2;
1893                         format = AC_FETCH_FORMAT_FLOAT;
1894                 } else {
1895                         /* 2_10_10_10 data formats */
1896                         LLVMValueRef data = loads[0];
1897                         LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1898                         LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1899                         loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1900                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1901                         loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1902                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1903                         loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1904                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1905                         loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1906
1907                         num_channels = 4;
1908                 }
1909         }
1910
1911         if (format == AC_FETCH_FORMAT_FLOAT) {
1912                 if (log_size != 2) {
1913                         for (unsigned chan = 0; chan < num_channels; ++chan) {
1914                                 tmp = ac_to_float(ctx, loads[chan]);
1915                                 if (log_size == 3)
1916                                         tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1917                                 else if (log_size == 1)
1918                                         tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1919                                 loads[chan] = ac_to_integer(ctx, tmp);
1920                         }
1921                 }
1922         } else if (format == AC_FETCH_FORMAT_UINT) {
1923                 if (log_size != 2) {
1924                         for (unsigned chan = 0; chan < num_channels; ++chan)
1925                                 loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1926                 }
1927         } else if (format == AC_FETCH_FORMAT_SINT) {
1928                 if (log_size != 2) {
1929                         for (unsigned chan = 0; chan < num_channels; ++chan)
1930                                 loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1931                 }
1932         } else {
1933                 bool unsign = format == AC_FETCH_FORMAT_UNORM ||
1934                               format == AC_FETCH_FORMAT_USCALED ||
1935                               format == AC_FETCH_FORMAT_UINT;
1936
1937                 for (unsigned chan = 0; chan < num_channels; ++chan) {
1938                         if (unsign) {
1939                                 tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1940                         } else {
1941                                 tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1942                         }
1943
1944                         LLVMValueRef scale = NULL;
1945                         if (format == AC_FETCH_FORMAT_FIXED) {
1946                                 assert(log_size == 2);
1947                                 scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1948                         } else if (format == AC_FETCH_FORMAT_UNORM) {
1949                                 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1950                                 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1951                         } else if (format == AC_FETCH_FORMAT_SNORM) {
1952                                 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1953                                 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1954                         }
1955                         if (scale)
1956                                 tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1957
1958                         if (format == AC_FETCH_FORMAT_SNORM) {
1959                                 /* Clamp to [-1, 1] */
1960                                 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
1961                                 LLVMValueRef clamp =
1962                                         LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
1963                                 tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
1964                         }
1965
1966                         loads[chan] = ac_to_integer(ctx, tmp);
1967                 }
1968         }
1969
1970         while (num_channels < 4) {
1971                 if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
1972                         loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
1973                 } else {
1974                         loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
1975                 }
1976                 num_channels++;
1977         }
1978
1979         if (reverse) {
1980                 tmp = loads[0];
1981                 loads[0] = loads[2];
1982                 loads[2] = tmp;
1983         }
1984
1985         return ac_build_gather_values(ctx, loads, 4);
1986 }
1987
1988 static void
1989 ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
1990                              LLVMValueRef rsrc,
1991                              LLVMValueRef vdata,
1992                              LLVMValueRef vindex,
1993                              LLVMValueRef voffset,
1994                              LLVMValueRef soffset,
1995                              unsigned num_channels,
1996                              unsigned dfmt,
1997                              unsigned nfmt,
1998                              bool glc,
1999                              bool slc,
2000                              bool writeonly_memory,
2001                              bool structurized)
2002 {
2003         LLVMValueRef args[7];
2004         int idx = 0;
2005         args[idx++] = vdata;
2006         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
2007         if (structurized)
2008                 args[idx++] = vindex ? vindex : ctx->i32_0;
2009         args[idx++] = voffset ? voffset : ctx->i32_0;
2010         args[idx++] = soffset ? soffset : ctx->i32_0;
2011         args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
2012         args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
2013         unsigned func = num_channels == 3 ? 4 : num_channels;
2014         const char *indexing_kind = structurized ? "struct" : "raw";
2015         char name[256], type_name[8];
2016
2017         LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
2018         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
2019
2020         snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
2021                  indexing_kind, type_name);
2022
2023         ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
2024                            ac_get_store_intr_attribs(writeonly_memory));
2025 }
2026
2027 static void
2028 ac_build_tbuffer_store(struct ac_llvm_context *ctx,
2029                        LLVMValueRef rsrc,
2030                        LLVMValueRef vdata,
2031                        LLVMValueRef vindex,
2032                        LLVMValueRef voffset,
2033                        LLVMValueRef soffset,
2034                        LLVMValueRef immoffset,
2035                        unsigned num_channels,
2036                        unsigned dfmt,
2037                        unsigned nfmt,
2038                        bool glc,
2039                        bool slc,
2040                        bool writeonly_memory,
2041                        bool structurized) /* only matters for LLVM 8+ */
2042 {
2043         if (HAVE_LLVM >= 0x800) {
2044                 voffset = LLVMBuildAdd(ctx->builder,
2045                                        voffset ? voffset : ctx->i32_0,
2046                                        immoffset, "");
2047
2048                 ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
2049                                              soffset, num_channels, dfmt, nfmt,
2050                                              glc, slc, writeonly_memory,
2051                                              structurized);
2052         } else {
2053                 LLVMValueRef params[] = {
2054                         vdata,
2055                         rsrc,
2056                         vindex ? vindex : ctx->i32_0,
2057                         voffset ? voffset : ctx->i32_0,
2058                         soffset ? soffset : ctx->i32_0,
2059                         immoffset,
2060                         LLVMConstInt(ctx->i32, dfmt, false),
2061                         LLVMConstInt(ctx->i32, nfmt, false),
2062                         LLVMConstInt(ctx->i1, glc, false),
2063                         LLVMConstInt(ctx->i1, slc, false),
2064                 };
2065                 unsigned func = CLAMP(num_channels, 1, 3) - 1;
2066                 const char *type_names[] = {"i32", "v2i32", "v4i32"};
2067                 char name[256];
2068
2069                 snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
2070                          type_names[func]);
2071
2072                 ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
2073                                    ac_get_store_intr_attribs(writeonly_memory));
2074         }
2075 }
2076
2077 void
2078 ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
2079                               LLVMValueRef rsrc,
2080                               LLVMValueRef vdata,
2081                               LLVMValueRef vindex,
2082                               LLVMValueRef voffset,
2083                               LLVMValueRef soffset,
2084                               LLVMValueRef immoffset,
2085                               unsigned num_channels,
2086                               unsigned dfmt,
2087                               unsigned nfmt,
2088                               bool glc,
2089                               bool slc,
2090                               bool writeonly_memory)
2091 {
2092         ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
2093                                immoffset, num_channels, dfmt, nfmt, glc, slc,
2094                                writeonly_memory, true);
2095 }
2096
2097 void
2098 ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
2099                            LLVMValueRef rsrc,
2100                            LLVMValueRef vdata,
2101                            LLVMValueRef voffset,
2102                            LLVMValueRef soffset,
2103                            LLVMValueRef immoffset,
2104                            unsigned num_channels,
2105                            unsigned dfmt,
2106                            unsigned nfmt,
2107                            bool glc,
2108                            bool slc,
2109                            bool writeonly_memory)
2110 {
2111         ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
2112                                immoffset, num_channels, dfmt, nfmt, glc, slc,
2113                                writeonly_memory, false);
2114 }
2115
2116 void
2117 ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
2118                              LLVMValueRef rsrc,
2119                              LLVMValueRef vdata,
2120                              LLVMValueRef voffset,
2121                              LLVMValueRef soffset,
2122                              bool glc,
2123                              bool writeonly_memory)
2124 {
2125         vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
2126
2127         if (HAVE_LLVM >= 0x900) {
2128                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
2129                 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
2130                                                    voffset, soffset, 1,
2131                                                    ctx->i16, glc, false,
2132                                                    writeonly_memory, false,
2133                                                    false);
2134         } else {
2135                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
2136                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
2137
2138                 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
2139
2140                 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
2141                                            ctx->i32_0, 1, dfmt, nfmt, glc, false,
2142                                            writeonly_memory);
2143         }
2144 }
2145
2146 void
2147 ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
2148                             LLVMValueRef rsrc,
2149                             LLVMValueRef vdata,
2150                             LLVMValueRef voffset,
2151                             LLVMValueRef soffset,
2152                             bool glc,
2153                             bool writeonly_memory)
2154 {
2155         vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
2156
2157         if (HAVE_LLVM >= 0x900) {
2158                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
2159                 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
2160                                                    voffset, soffset, 1,
2161                                                    ctx->i8, glc, false,
2162                                                    writeonly_memory, false,
2163                                                    false);
2164         } else {
2165                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
2166                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
2167
2168                 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
2169
2170                 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
2171                                            ctx->i32_0, 1, dfmt, nfmt, glc, false,
2172                                            writeonly_memory);
2173         }
2174 }
2175 /**
2176  * Set range metadata on an instruction.  This can only be used on load and
2177  * call instructions.  If you know an instruction can only produce the values
2178  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
2179  * \p lo is the minimum value inclusive.
2180  * \p hi is the maximum value exclusive.
2181  */
2182 static void set_range_metadata(struct ac_llvm_context *ctx,
2183                                LLVMValueRef value, unsigned lo, unsigned hi)
2184 {
2185         LLVMValueRef range_md, md_args[2];
2186         LLVMTypeRef type = LLVMTypeOf(value);
2187         LLVMContextRef context = LLVMGetTypeContext(type);
2188
2189         md_args[0] = LLVMConstInt(type, lo, false);
2190         md_args[1] = LLVMConstInt(type, hi, false);
2191         range_md = LLVMMDNodeInContext(context, md_args, 2);
2192         LLVMSetMetadata(value, ctx->range_md_kind, range_md);
2193 }
2194
2195 LLVMValueRef
2196 ac_get_thread_id(struct ac_llvm_context *ctx)
2197 {
2198         LLVMValueRef tid;
2199
2200         LLVMValueRef tid_args[2];
2201         tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
2202         tid_args[1] = ctx->i32_0;
2203         tid_args[1] = ac_build_intrinsic(ctx,
2204                                          "llvm.amdgcn.mbcnt.lo", ctx->i32,
2205                                          tid_args, 2, AC_FUNC_ATTR_READNONE);
2206
2207         tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
2208                                  ctx->i32, tid_args,
2209                                  2, AC_FUNC_ATTR_READNONE);
2210         set_range_metadata(ctx, tid, 0, 64);
2211         return tid;
2212 }
2213
2214 /*
2215  * AMD GCN implements derivatives using the local data store (LDS)
2216  * All writes to the LDS happen in all executing threads at
2217  * the same time. TID is the Thread ID for the current
2218  * thread and is a value between 0 and 63, representing
2219  * the thread's position in the wavefront.
2220  *
2221  * For the pixel shader threads are grouped into quads of four pixels.
2222  * The TIDs of the pixels of a quad are:
2223  *
2224  *  +------+------+
2225  *  |4n + 0|4n + 1|
2226  *  +------+------+
2227  *  |4n + 2|4n + 3|
2228  *  +------+------+
2229  *
2230  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
2231  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
2232  * the current pixel's column, and masking with 0xfffffffe yields the TID
2233  * of the left pixel of the current pixel's row.
2234  *
2235  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
2236  * adding 2 yields the TID of the pixel below the top pixel.
2237  */
2238 LLVMValueRef
2239 ac_build_ddxy(struct ac_llvm_context *ctx,
2240               uint32_t mask,
2241               int idx,
2242               LLVMValueRef val)
2243 {
2244         unsigned tl_lanes[4], trbl_lanes[4];
2245         char name[32], type[8];
2246         LLVMValueRef tl, trbl;
2247         LLVMTypeRef result_type;
2248         LLVMValueRef result;
2249
2250         result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
2251
2252         if (result_type == ctx->f16)
2253                 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
2254
2255         for (unsigned i = 0; i < 4; ++i) {
2256                 tl_lanes[i] = i & mask;
2257                 trbl_lanes[i] = (i & mask) + idx;
2258         }
2259
2260         tl = ac_build_quad_swizzle(ctx, val,
2261                                    tl_lanes[0], tl_lanes[1],
2262                                    tl_lanes[2], tl_lanes[3]);
2263         trbl = ac_build_quad_swizzle(ctx, val,
2264                                      trbl_lanes[0], trbl_lanes[1],
2265                                      trbl_lanes[2], trbl_lanes[3]);
2266
2267         if (result_type == ctx->f16) {
2268                 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
2269                 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
2270         }
2271
2272         tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
2273         trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
2274         result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
2275
2276         ac_build_type_name_for_intr(result_type, type, sizeof(type));
2277         snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
2278
2279         return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
2280 }
2281
2282 void
2283 ac_build_sendmsg(struct ac_llvm_context *ctx,
2284                  uint32_t msg,
2285                  LLVMValueRef wave_id)
2286 {
2287         LLVMValueRef args[2];
2288         args[0] = LLVMConstInt(ctx->i32, msg, false);
2289         args[1] = wave_id;
2290         ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
2291 }
2292
2293 LLVMValueRef
2294 ac_build_imsb(struct ac_llvm_context *ctx,
2295               LLVMValueRef arg,
2296               LLVMTypeRef dst_type)
2297 {
2298         LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
2299                                               dst_type, &arg, 1,
2300                                               AC_FUNC_ATTR_READNONE);
2301
2302         /* The HW returns the last bit index from MSB, but NIR/TGSI wants
2303          * the index from LSB. Invert it by doing "31 - msb". */
2304         msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
2305                            msb, "");
2306
2307         LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
2308         LLVMValueRef cond = LLVMBuildOr(ctx->builder,
2309                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2310                                                       arg, ctx->i32_0, ""),
2311                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2312                                                       arg, all_ones, ""), "");
2313
2314         return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
2315 }
2316
2317 LLVMValueRef
2318 ac_build_umsb(struct ac_llvm_context *ctx,
2319               LLVMValueRef arg,
2320               LLVMTypeRef dst_type)
2321 {
2322         const char *intrin_name;
2323         LLVMTypeRef type;
2324         LLVMValueRef highest_bit;
2325         LLVMValueRef zero;
2326         unsigned bitsize;
2327
2328         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
2329         switch (bitsize) {
2330         case 64:
2331                 intrin_name = "llvm.ctlz.i64";
2332                 type = ctx->i64;
2333                 highest_bit = LLVMConstInt(ctx->i64, 63, false);
2334                 zero = ctx->i64_0;
2335                 break;
2336         case 32:
2337                 intrin_name = "llvm.ctlz.i32";
2338                 type = ctx->i32;
2339                 highest_bit = LLVMConstInt(ctx->i32, 31, false);
2340                 zero = ctx->i32_0;
2341                 break;
2342         case 16:
2343                 intrin_name = "llvm.ctlz.i16";
2344                 type = ctx->i16;
2345                 highest_bit = LLVMConstInt(ctx->i16, 15, false);
2346                 zero = ctx->i16_0;
2347                 break;
2348         case 8:
2349                 intrin_name = "llvm.ctlz.i8";
2350                 type = ctx->i8;
2351                 highest_bit = LLVMConstInt(ctx->i8, 7, false);
2352                 zero = ctx->i8_0;
2353                 break;
2354         default:
2355                 unreachable(!"invalid bitsize");
2356                 break;
2357         }
2358
2359         LLVMValueRef params[2] = {
2360                 arg,
2361                 ctx->i1true,
2362         };
2363
2364         LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
2365                                               params, 2,
2366                                               AC_FUNC_ATTR_READNONE);
2367
2368         /* The HW returns the last bit index from MSB, but TGSI/NIR wants
2369          * the index from LSB. Invert it by doing "31 - msb". */
2370         msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
2371
2372         if (bitsize == 64) {
2373                 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
2374         } else if (bitsize < 32) {
2375                 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
2376         }
2377
2378         /* check for zero */
2379         return LLVMBuildSelect(ctx->builder,
2380                                LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
2381                                LLVMConstInt(ctx->i32, -1, true), msb, "");
2382 }
2383
2384 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
2385                            LLVMValueRef b)
2386 {
2387         char name[64];
2388         snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2389         LLVMValueRef args[2] = {a, b};
2390         return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2391                                   AC_FUNC_ATTR_READNONE);
2392 }
2393
2394 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
2395                            LLVMValueRef b)
2396 {
2397         char name[64];
2398         snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2399         LLVMValueRef args[2] = {a, b};
2400         return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2401                                   AC_FUNC_ATTR_READNONE);
2402 }
2403
2404 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a,
2405                            LLVMValueRef b)
2406 {
2407         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
2408         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2409 }
2410
2411 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a,
2412                            LLVMValueRef b)
2413 {
2414         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
2415         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2416 }
2417
2418 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
2419                            LLVMValueRef b)
2420 {
2421         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
2422         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2423 }
2424
2425 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a,
2426                            LLVMValueRef b)
2427 {
2428         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
2429         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2430 }
2431
2432 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
2433 {
2434         LLVMTypeRef t = LLVMTypeOf(value);
2435         return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
2436                              LLVMConstReal(t, 1.0));
2437 }
2438
2439 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
2440 {
2441         LLVMValueRef args[9];
2442
2443         args[0] = LLVMConstInt(ctx->i32, a->target, 0);
2444         args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
2445
2446         if (a->compr) {
2447                 LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
2448                 LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
2449
2450                 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
2451                                 v2i16, "");
2452                 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
2453                                 v2i16, "");
2454                 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
2455                 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2456
2457                 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
2458                                    ctx->voidt, args, 6, 0);
2459         } else {
2460                 args[2] = a->out[0];
2461                 args[3] = a->out[1];
2462                 args[4] = a->out[2];
2463                 args[5] = a->out[3];
2464                 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
2465                 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2466
2467                 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
2468                                    ctx->voidt, args, 8, 0);
2469         }
2470 }
2471
2472 void ac_build_export_null(struct ac_llvm_context *ctx)
2473 {
2474         struct ac_export_args args;
2475
2476         args.enabled_channels = 0x0; /* enabled channels */
2477         args.valid_mask = 1; /* whether the EXEC mask is valid */
2478         args.done = 1; /* DONE bit */
2479         args.target = V_008DFC_SQ_EXP_NULL;
2480         args.compr = 0; /* COMPR flag (0 = 32-bit export) */
2481         args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2482         args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2483         args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2484         args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2485
2486         ac_build_export(ctx, &args);
2487 }
2488
2489 static unsigned ac_num_coords(enum ac_image_dim dim)
2490 {
2491         switch (dim) {
2492         case ac_image_1d:
2493                 return 1;
2494         case ac_image_2d:
2495         case ac_image_1darray:
2496                  return 2;
2497         case ac_image_3d:
2498         case ac_image_cube:
2499         case ac_image_2darray:
2500         case ac_image_2dmsaa:
2501                 return 3;
2502         case ac_image_2darraymsaa:
2503                 return 4;
2504         default:
2505                 unreachable("ac_num_coords: bad dim");
2506         }
2507 }
2508
2509 static unsigned ac_num_derivs(enum ac_image_dim dim)
2510 {
2511         switch (dim) {
2512         case ac_image_1d:
2513         case ac_image_1darray:
2514                 return 2;
2515         case ac_image_2d:
2516         case ac_image_2darray:
2517         case ac_image_cube:
2518                 return 4;
2519         case ac_image_3d:
2520                 return 6;
2521         case ac_image_2dmsaa:
2522         case ac_image_2darraymsaa:
2523         default:
2524                 unreachable("derivatives not supported");
2525         }
2526 }
2527
2528 static const char *get_atomic_name(enum ac_atomic_op op)
2529 {
2530         switch (op) {
2531         case ac_atomic_swap: return "swap";
2532         case ac_atomic_add: return "add";
2533         case ac_atomic_sub: return "sub";
2534         case ac_atomic_smin: return "smin";
2535         case ac_atomic_umin: return "umin";
2536         case ac_atomic_smax: return "smax";
2537         case ac_atomic_umax: return "umax";
2538         case ac_atomic_and: return "and";
2539         case ac_atomic_or: return "or";
2540         case ac_atomic_xor: return "xor";
2541         }
2542         unreachable("bad atomic op");
2543 }
2544
/**
 * Build a call to an "llvm.amdgcn.image.*" intrinsic described by \p a.
 *
 * The intrinsic name is assembled from the opcode, the atomic sub-operation,
 * the .c/.b/.l/.d/.lz/.o modifiers, the dimension name, the data type
 * ("i32" for atomics, "v4f32" otherwise) and up to three type-overload
 * suffixes.
 *
 * Operands are pushed in this order: store/atomic data, dmask (non-atomic
 * only), offset, bias, compare, derivatives, coordinates, lod, resource,
 * sampler and unorm (sample-like opcodes only), texfailctrl (always 0) and
 * the cache policy.
 *
 * \return the intrinsic's result: i32 for atomics, void for stores, v4f32
 *         for sample-like opcodes, and a v4i32 bitcast of the v4f32 result
 *         for the remaining opcodes.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
                                   struct ac_image_args *a)
{
        const char *overload[3] = { "", "", "" };
        unsigned num_overloads = 0;
        LLVMValueRef args[18];
        unsigned num_args = 0;
        enum ac_image_dim dim = a->dim;

        /* With level_zero set, lod must be absent or one of the constant
         * zeros. */
        assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
               !a->level_zero);
        /* get_resinfo, load_mip and store_mip require a lod. */
        assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
                a->opcode != ac_image_store_mip) ||
               a->lod);
        /* compare and offset are only accepted for sample and gather4. */
        assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               (!a->compare && !a->offset));
        /* bias is only accepted for sample, gather4 and get_lod. */
        assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                a->opcode == ac_image_get_lod) ||
               !a->bias);
        /* At most one of bias, lod, level_zero and derivatives may be used. */
        assert((a->bias ? 1 : 0) +
               (a->lod ? 1 : 0) +
               (a->level_zero ? 1 : 0) +
               (a->derivs[0] ? 1 : 0) <= 1);

        /* For get_lod, array and cube dimensions are replaced by their
         * non-array 1d/2d counterparts. */
        if (a->opcode == ac_image_get_lod) {
                switch (dim) {
                case ac_image_1darray:
                        dim = ac_image_1d;
                        break;
                case ac_image_2darray:
                case ac_image_cube:
                        dim = ac_image_2d;
                        break;
                default:
                        break;
                }
        }

        /* "sample" opcodes take a sampler and use f32 coordinates; all
         * other opcodes use i32 coordinates. */
        bool sample = a->opcode == ac_image_sample ||
                      a->opcode == ac_image_gather4 ||
                      a->opcode == ac_image_get_lod;
        bool atomic = a->opcode == ac_image_atomic ||
                      a->opcode == ac_image_atomic_cmpswap;
        LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;

        /* Data operands come first; cmpswap carries a second one. */
        if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
                args[num_args++] = a->data[0];
                if (a->opcode == ac_image_atomic_cmpswap)
                        args[num_args++] = a->data[1];
        }

        if (!atomic)
                args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);

        /* Optional modifiers; bias and derivatives each add an ".f32"
         * overload suffix. */
        if (a->offset)
                args[num_args++] = ac_to_integer(ctx, a->offset);
        if (a->bias) {
                args[num_args++] = ac_to_float(ctx, a->bias);
                overload[num_overloads++] = ".f32";
        }
        if (a->compare)
                args[num_args++] = ac_to_float(ctx, a->compare);
        if (a->derivs[0]) {
                unsigned count = ac_num_derivs(dim);
                for (unsigned i = 0; i < count; ++i)
                        args[num_args++] = ac_to_float(ctx, a->derivs[i]);
                overload[num_overloads++] = ".f32";
        }
        /* get_resinfo takes no coordinates. */
        unsigned num_coords =
                a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
        for (unsigned i = 0; i < num_coords; ++i)
                args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
        if (a->lod)
                args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
        /* The last overload suffix names the coordinate type. */
        overload[num_overloads++] = sample ? ".f32" : ".i32";

        args[num_args++] = a->resource;
        if (sample) {
                args[num_args++] = a->sampler;
                args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
        }

        args[num_args++] = ctx->i32_0; /* texfailctrl */
        args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);

        /* Base name of the intrinsic, plus the sub-operation for atomics. */
        const char *name;
        const char *atomic_subop = "";
        switch (a->opcode) {
        case ac_image_sample: name = "sample"; break;
        case ac_image_gather4: name = "gather4"; break;
        case ac_image_load: name = "load"; break;
        case ac_image_load_mip: name = "load.mip"; break;
        case ac_image_store: name = "store"; break;
        case ac_image_store_mip: name = "store.mip"; break;
        case ac_image_atomic:
                name = "atomic.";
                atomic_subop = get_atomic_name(a->atomic);
                break;
        case ac_image_atomic_cmpswap:
                name = "atomic.";
                atomic_subop = "cmpswap";
                break;
        case ac_image_get_lod: name = "getlod"; break;
        case ac_image_get_resinfo: name = "getresinfo"; break;
        default: unreachable("invalid image opcode");
        }

        /* Dimension component of the name (after the get_lod remap above). */
        const char *dimname;
        switch (dim) {
        case ac_image_1d: dimname = "1d"; break;
        case ac_image_2d: dimname = "2d"; break;
        case ac_image_3d: dimname = "3d"; break;
        case ac_image_cube: dimname = "cube"; break;
        case ac_image_1darray: dimname = "1darray"; break;
        case ac_image_2darray: dimname = "2darray"; break;
        case ac_image_2dmsaa: dimname = "2dmsaa"; break;
        case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
        default: unreachable("invalid dim");
        }

        /* The ".l" modifier is only added for sample and gather4; other
         * opcodes that take a lod encode it in their base name or not at
         * all. */
        bool lod_suffix =
                a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
        char intr_name[96];
        snprintf(intr_name, sizeof(intr_name),
                 "llvm.amdgcn.image.%s%s" /* base name */
                 "%s%s%s" /* sample/gather modifiers */
                 ".%s.%s%s%s%s", /* dimension and type overloads */
                 name, atomic_subop,
                 a->compare ? ".c" : "",
                 a->bias ? ".b" :
                 lod_suffix ? ".l" :
                 a->derivs[0] ? ".d" :
                 a->level_zero ? ".lz" : "",
                 a->offset ? ".o" : "",
                 dimname,
                 atomic ? "i32" : "v4f32",
                 overload[0], overload[1], overload[2]);

        /* Return type: i32 for atomics, void for stores, v4f32 otherwise. */
        LLVMTypeRef retty;
        if (atomic)
                retty = ctx->i32;
        else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
                retty = ctx->voidt;
        else
                retty = ctx->v4f32;

        LLVMValueRef result =
                ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
                                   a->attributes);
        /* Non-sample opcodes hand their v4f32 result back as v4i32. */
        if (!sample && retty == ctx->v4f32) {
                result = LLVMBuildBitCast(ctx->builder, result,
                                          ctx->v4i32, "");
        }
        return result;
}
2700
2701 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
2702                                     LLVMValueRef args[2])
2703 {
2704         LLVMTypeRef v2f16 =
2705                 LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
2706
2707         return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
2708                                   args, 2, AC_FUNC_ATTR_READNONE);
2709 }
2710
2711 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
2712                                      LLVMValueRef args[2])
2713 {
2714         LLVMValueRef res =
2715                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
2716                                    ctx->v2i16, args, 2,
2717                                    AC_FUNC_ATTR_READNONE);
2718         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2719 }
2720
2721 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
2722                                      LLVMValueRef args[2])
2723 {
2724         LLVMValueRef res =
2725                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
2726                                    ctx->v2i16, args, 2,
2727                                    AC_FUNC_ATTR_READNONE);
2728         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2729 }
2730
2731 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2732 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx,
2733                                  LLVMValueRef args[2], unsigned bits, bool hi)
2734 {
2735         assert(bits == 8 || bits == 10 || bits == 16);
2736
2737         LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2738                 bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2739         LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2740                 bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2741         LLVMValueRef max_alpha =
2742                 bits != 10 ? max_rgb : ctx->i32_1;
2743         LLVMValueRef min_alpha =
2744                 bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2745
2746         /* Clamp. */
2747         if (bits != 16) {
2748                 for (int i = 0; i < 2; i++) {
2749                         bool alpha = hi && i == 1;
2750                         args[i] = ac_build_imin(ctx, args[i],
2751                                                 alpha ? max_alpha : max_rgb);
2752                         args[i] = ac_build_imax(ctx, args[i],
2753                                                 alpha ? min_alpha : min_rgb);
2754                 }
2755         }
2756
2757         LLVMValueRef res =
2758                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
2759                                    ctx->v2i16, args, 2,
2760                                    AC_FUNC_ATTR_READNONE);
2761         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2762 }
2763
2764 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2765 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx,
2766                                  LLVMValueRef args[2], unsigned bits, bool hi)
2767 {
2768         assert(bits == 8 || bits == 10 || bits == 16);
2769
2770         LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2771                 bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2772         LLVMValueRef max_alpha =
2773                 bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2774
2775         /* Clamp. */
2776         if (bits != 16) {
2777                 for (int i = 0; i < 2; i++) {
2778                         bool alpha = hi && i == 1;
2779                         args[i] = ac_build_umin(ctx, args[i],
2780                                                 alpha ? max_alpha : max_rgb);
2781                 }
2782         }
2783
2784         LLVMValueRef res =
2785                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
2786                                    ctx->v2i16, args, 2,
2787                                    AC_FUNC_ATTR_READNONE);
2788         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2789 }
2790
2791 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2792 {
2793         return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
2794                                   &i1, 1, AC_FUNC_ATTR_READNONE);
2795 }
2796
2797 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2798 {
2799         ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
2800                            &i1, 1, 0);
2801 }
2802
2803 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
2804                           LLVMValueRef offset, LLVMValueRef width,
2805                           bool is_signed)
2806 {
2807         LLVMValueRef args[] = {
2808                 input,
2809                 offset,
2810                 width,
2811         };
2812
2813         return ac_build_intrinsic(ctx,
2814                                   is_signed ? "llvm.amdgcn.sbfe.i32" :
2815                                               "llvm.amdgcn.ubfe.i32",
2816                                   ctx->i32, args, 3,
2817                                   AC_FUNC_ATTR_READNONE);
2818 }
2819
2820 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2821                            LLVMValueRef s1, LLVMValueRef s2)
2822 {
2823         return LLVMBuildAdd(ctx->builder,
2824                             LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2825 }
2826
2827 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2828                            LLVMValueRef s1, LLVMValueRef s2)
2829 {
2830         return LLVMBuildFAdd(ctx->builder,
2831                              LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2832 }
2833
2834 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
2835 {
2836         LLVMValueRef args[1] = {
2837                 LLVMConstInt(ctx->i32, simm16, false),
2838         };
2839         ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
2840                            ctx->voidt, args, 1, 0);
2841 }
2842
2843 LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0,
2844                             LLVMValueRef src1, LLVMValueRef src2,
2845                             unsigned bitsize)
2846 {
2847         LLVMTypeRef type;
2848         char *intr;
2849
2850         if (bitsize == 16) {
2851                 intr = "llvm.amdgcn.fmed3.f16";
2852                 type = ctx->f16;
2853         } else if (bitsize == 32) {
2854                 intr = "llvm.amdgcn.fmed3.f32";
2855                 type = ctx->f32;
2856         } else {
2857                 intr = "llvm.amdgcn.fmed3.f64";
2858                 type = ctx->f64;
2859         }
2860
2861         LLVMValueRef params[] = {
2862                 src0,
2863                 src1,
2864                 src2,
2865         };
2866         return ac_build_intrinsic(ctx, intr, type, params, 3,
2867                                   AC_FUNC_ATTR_READNONE);
2868 }
2869
2870 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
2871                             unsigned bitsize)
2872 {
2873         LLVMTypeRef type;
2874         char *intr;
2875
2876         if (bitsize == 16) {
2877                 intr = "llvm.amdgcn.fract.f16";
2878                 type = ctx->f16;
2879         } else if (bitsize == 32) {
2880                 intr = "llvm.amdgcn.fract.f32";
2881                 type = ctx->f32;
2882         } else {
2883                 intr = "llvm.amdgcn.fract.f64";
2884                 type = ctx->f64;
2885         }
2886
2887         LLVMValueRef params[] = {
2888                 src0,
2889         };
2890         return ac_build_intrinsic(ctx, intr, type, params, 1,
2891                                   AC_FUNC_ATTR_READNONE);
2892 }
2893
2894 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2895                             unsigned bitsize)
2896 {
2897         LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
2898         LLVMValueRef zero = LLVMConstInt(type, 0, false);
2899         LLVMValueRef one = LLVMConstInt(type, 1, false);
2900
2901         LLVMValueRef cmp, val;
2902         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
2903         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2904         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
2905         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
2906         return val;
2907 }
2908
2909 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2910                             unsigned bitsize)
2911 {
2912         LLVMValueRef cmp, val, zero, one;
2913         LLVMTypeRef type;
2914
2915         if (bitsize == 16) {
2916                 type = ctx->f16;
2917                 zero = ctx->f16_0;
2918                 one = ctx->f16_1;
2919         } else if (bitsize == 32) {
2920                 type = ctx->f32;
2921                 zero = ctx->f32_0;
2922                 one = ctx->f32_1;
2923         } else {
2924                 type = ctx->f64;
2925                 zero = ctx->f64_0;
2926                 one = ctx->f64_1;
2927         }
2928
2929         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
2930         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2931         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
2932         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
2933         return val;
2934 }
2935
2936 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
2937 {
2938         LLVMValueRef result;
2939         unsigned bitsize;
2940
2941         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2942
2943         switch (bitsize) {
2944         case 64:
2945                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
2946                                             (LLVMValueRef []) { src0 }, 1,
2947                                             AC_FUNC_ATTR_READNONE);
2948
2949                 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2950                 break;
2951         case 32:
2952                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
2953                                             (LLVMValueRef []) { src0 }, 1,
2954                                             AC_FUNC_ATTR_READNONE);
2955                 break;
2956         case 16:
2957                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
2958                                             (LLVMValueRef []) { src0 }, 1,
2959                                             AC_FUNC_ATTR_READNONE);
2960
2961                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2962                 break;
2963         case 8:
2964                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8,
2965                                             (LLVMValueRef []) { src0 }, 1,
2966                                             AC_FUNC_ATTR_READNONE);
2967
2968                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2969                 break;
2970         default:
2971                 unreachable(!"invalid bitsize");
2972                 break;
2973         }
2974
2975         return result;
2976 }
2977
2978 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
2979                                        LLVMValueRef src0)
2980 {
2981         LLVMValueRef result;
2982         unsigned bitsize;
2983
2984         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2985
2986         switch (bitsize) {
2987         case 64:
2988                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64,
2989                                             (LLVMValueRef []) { src0 }, 1,
2990                                             AC_FUNC_ATTR_READNONE);
2991
2992                 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2993                 break;
2994         case 32:
2995                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32,
2996                                             (LLVMValueRef []) { src0 }, 1,
2997                                             AC_FUNC_ATTR_READNONE);
2998                 break;
2999         case 16:
3000                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
3001                                             (LLVMValueRef []) { src0 }, 1,
3002                                             AC_FUNC_ATTR_READNONE);
3003
3004                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3005                 break;
3006         case 8:
3007                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8,
3008                                             (LLVMValueRef []) { src0 }, 1,
3009                                             AC_FUNC_ATTR_READNONE);
3010
3011                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3012                 break;
3013         default:
3014                 unreachable(!"invalid bitsize");
3015                 break;
3016         }
3017
3018         return result;
3019 }
3020
/* Operand indices of an export call, as passed to LLVMGetOperand() and
 * LLVMSetOperand() in this file. The four output channels occupy
 * AC_EXP_OUT0 .. AC_EXP_OUT0 + 3.
 */
#define AC_EXP_TARGET 0
#define AC_EXP_ENABLED_CHANNELS 1
#define AC_EXP_OUT0 2
3024
/* Classification of one export channel operand. */
enum ac_ir_type {
	AC_IR_UNDEF, /* operand is an LLVM undef */
	AC_IR_CONST, /* operand is a floating-point constant */
	AC_IR_VALUE, /* any other operand */
};
3030
3031 struct ac_vs_exp_chan
3032 {
3033         LLVMValueRef value;
3034         float const_float;
3035         enum ac_ir_type type;
3036 };
3037
3038 struct ac_vs_exp_inst {
3039         unsigned offset;
3040         LLVMValueRef inst;
3041         struct ac_vs_exp_chan chan[4];
3042 };
3043
3044 struct ac_vs_exports {
3045         unsigned num;
3046         struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
3047 };
3048
3049 /* Return true if the PARAM export has been eliminated. */
3050 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
3051                                       uint32_t num_outputs,
3052                                       struct ac_vs_exp_inst *exp)
3053 {
3054         unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
3055         bool is_zero[4] = {}, is_one[4] = {};
3056
3057         for (i = 0; i < 4; i++) {
3058                 /* It's a constant expression. Undef outputs are eliminated too. */
3059                 if (exp->chan[i].type == AC_IR_UNDEF) {
3060                         is_zero[i] = true;
3061                         is_one[i] = true;
3062                 } else if (exp->chan[i].type == AC_IR_CONST) {
3063                         if (exp->chan[i].const_float == 0)
3064                                 is_zero[i] = true;
3065                         else if (exp->chan[i].const_float == 1)
3066                                 is_one[i] = true;
3067                         else
3068                                 return false; /* other constant */
3069                 } else
3070                         return false;
3071         }
3072
3073         /* Only certain combinations of 0 and 1 can be eliminated. */
3074         if (is_zero[0] && is_zero[1] && is_zero[2])
3075                 default_val = is_zero[3] ? 0 : 1;
3076         else if (is_one[0] && is_one[1] && is_one[2])
3077                 default_val = is_zero[3] ? 2 : 3;
3078         else
3079                 return false;
3080
3081         /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
3082         LLVMInstructionEraseFromParent(exp->inst);
3083
3084         /* Change OFFSET to DEFAULT_VAL. */
3085         for (i = 0; i < num_outputs; i++) {
3086                 if (vs_output_param_offset[i] == exp->offset) {
3087                         vs_output_param_offset[i] =
3088                                 AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
3089                         break;
3090                 }
3091         }
3092         return true;
3093 }
3094
3095 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
3096                                            uint8_t *vs_output_param_offset,
3097                                            uint32_t num_outputs,
3098                                            struct ac_vs_exports *processed,
3099                                            struct ac_vs_exp_inst *exp)
3100 {
3101         unsigned p, copy_back_channels = 0;
3102
3103         /* See if the output is already in the list of processed outputs.
3104          * The LLVMValueRef comparison relies on SSA.
3105          */
3106         for (p = 0; p < processed->num; p++) {
3107                 bool different = false;
3108
3109                 for (unsigned j = 0; j < 4; j++) {
3110                         struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
3111                         struct ac_vs_exp_chan *c2 = &exp->chan[j];
3112
3113                         /* Treat undef as a match. */
3114                         if (c2->type == AC_IR_UNDEF)
3115                                 continue;
3116
3117                         /* If c1 is undef but c2 isn't, we can copy c2 to c1
3118                          * and consider the instruction duplicated.
3119                          */
3120                         if (c1->type == AC_IR_UNDEF) {
3121                                 copy_back_channels |= 1 << j;
3122                                 continue;
3123                         }
3124
3125                         /* Test whether the channels are not equal. */
3126                         if (c1->type != c2->type ||
3127                             (c1->type == AC_IR_CONST &&
3128                              c1->const_float != c2->const_float) ||
3129                             (c1->type == AC_IR_VALUE &&
3130                              c1->value != c2->value)) {
3131                                 different = true;
3132                                 break;
3133                         }
3134                 }
3135                 if (!different)
3136                         break;
3137
3138                 copy_back_channels = 0;
3139         }
3140         if (p == processed->num)
3141                 return false;
3142
3143         /* If a match was found, but the matching export has undef where the new
3144          * one has a normal value, copy the normal value to the undef channel.
3145          */
3146         struct ac_vs_exp_inst *match = &processed->exp[p];
3147
3148         /* Get current enabled channels mask. */
3149         LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
3150         unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
3151
3152         while (copy_back_channels) {
3153                 unsigned chan = u_bit_scan(&copy_back_channels);
3154
3155                 assert(match->chan[chan].type == AC_IR_UNDEF);
3156                 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
3157                                exp->chan[chan].value);
3158                 match->chan[chan] = exp->chan[chan];
3159
3160                 /* Update number of enabled channels because the original mask
3161                  * is not always 0xf.
3162                  */
3163                 enabled_channels |= (1 << chan);
3164                 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
3165                                LLVMConstInt(ctx->i32, enabled_channels, 0));
3166         }
3167
3168         /* The PARAM export is duplicated. Kill it. */
3169         LLVMInstructionEraseFromParent(exp->inst);
3170
3171         /* Change OFFSET to the matching export. */
3172         for (unsigned i = 0; i < num_outputs; i++) {
3173                 if (vs_output_param_offset[i] == exp->offset) {
3174                         vs_output_param_offset[i] = match->offset;
3175                         break;
3176                 }
3177         }
3178         return true;
3179 }
3180
3181 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
3182                             LLVMValueRef main_fn,
3183                             uint8_t *vs_output_param_offset,
3184                             uint32_t num_outputs,
3185                             uint8_t *num_param_exports)
3186 {
3187         LLVMBasicBlockRef bb;
3188         bool removed_any = false;
3189         struct ac_vs_exports exports;
3190
3191         exports.num = 0;
3192
3193         /* Process all LLVM instructions. */
3194         bb = LLVMGetFirstBasicBlock(main_fn);
3195         while (bb) {
3196                 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
3197
3198                 while (inst) {
3199                         LLVMValueRef cur = inst;
3200                         inst = LLVMGetNextInstruction(inst);
3201                         struct ac_vs_exp_inst exp;
3202
3203                         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
3204                                 continue;
3205
3206                         LLVMValueRef callee = ac_llvm_get_called_value(cur);
3207
3208                         if (!ac_llvm_is_function(callee))
3209                                 continue;
3210
3211                         const char *name = LLVMGetValueName(callee);
3212                         unsigned num_args = LLVMCountParams(callee);
3213
3214                         /* Check if this is an export instruction. */
3215                         if ((num_args != 9 && num_args != 8) ||
3216                             (strcmp(name, "llvm.SI.export") &&
3217                              strcmp(name, "llvm.amdgcn.exp.f32")))
3218                                 continue;
3219
3220                         LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
3221                         unsigned target = LLVMConstIntGetZExtValue(arg);
3222
3223                         if (target < V_008DFC_SQ_EXP_PARAM)
3224                                 continue;
3225
3226                         target -= V_008DFC_SQ_EXP_PARAM;
3227
3228                         /* Parse the instruction. */
3229                         memset(&exp, 0, sizeof(exp));
3230                         exp.offset = target;
3231                         exp.inst = cur;
3232
3233                         for (unsigned i = 0; i < 4; i++) {
3234                                 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
3235
3236                                 exp.chan[i].value = v;
3237
3238                                 if (LLVMIsUndef(v)) {
3239                                         exp.chan[i].type = AC_IR_UNDEF;
3240                                 } else if (LLVMIsAConstantFP(v)) {
3241                                         LLVMBool loses_info;
3242                                         exp.chan[i].type = AC_IR_CONST;
3243                                         exp.chan[i].const_float =
3244                                                 LLVMConstRealGetDouble(v, &loses_info);
3245                                 } else {
3246                                         exp.chan[i].type = AC_IR_VALUE;
3247                                 }
3248                         }
3249
3250                         /* Eliminate constant and duplicated PARAM exports. */
3251                         if (ac_eliminate_const_output(vs_output_param_offset,
3252                                                       num_outputs, &exp) ||
3253                             ac_eliminate_duplicated_output(ctx,
3254                                                            vs_output_param_offset,
3255                                                            num_outputs, &exports,
3256                                                            &exp)) {
3257                                 removed_any = true;
3258                         } else {
3259                                 exports.exp[exports.num++] = exp;
3260                         }
3261                 }
3262                 bb = LLVMGetNextBasicBlock(bb);
3263         }
3264
3265         /* Remove holes in export memory due to removed PARAM exports.
3266          * This is done by renumbering all PARAM exports.
3267          */
3268         if (removed_any) {
3269                 uint8_t old_offset[VARYING_SLOT_MAX];
3270                 unsigned out, i;
3271
3272                 /* Make a copy of the offsets. We need the old version while
3273                  * we are modifying some of them. */
3274                 memcpy(old_offset, vs_output_param_offset,
3275                        sizeof(old_offset));
3276
3277                 for (i = 0; i < exports.num; i++) {
3278                         unsigned offset = exports.exp[i].offset;
3279
3280                         /* Update vs_output_param_offset. Multiple outputs can
3281                          * have the same offset.
3282                          */
3283                         for (out = 0; out < num_outputs; out++) {
3284                                 if (old_offset[out] == offset)
3285                                         vs_output_param_offset[out] = i;
3286                         }
3287
3288                         /* Change the PARAM offset in the instruction. */
3289                         LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
3290                                        LLVMConstInt(ctx->i32,
3291                                                     V_008DFC_SQ_EXP_PARAM + i, 0));
3292                 }
3293                 *num_param_exports = exports.num;
3294         }
3295 }
3296
3297 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
3298 {
3299         LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
3300         ac_build_intrinsic(ctx,
3301                            "llvm.amdgcn.init.exec", ctx->voidt,
3302                            &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
3303 }
3304
3305 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
3306 {
3307         unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
3308         ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
3309                                      LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
3310                                      "lds");
3311 }
3312
3313 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
3314                          LLVMValueRef dw_addr)
3315 {
3316         return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
3317 }
3318
3319 void ac_lds_store(struct ac_llvm_context *ctx,
3320                   LLVMValueRef dw_addr,
3321                   LLVMValueRef value)
3322 {
3323         value = ac_to_integer(ctx, value);
3324         ac_build_indexed_store(ctx, ctx->lds,
3325                                dw_addr, value);
3326 }
3327
3328 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
3329                          LLVMTypeRef dst_type,
3330                          LLVMValueRef src0)
3331 {
3332         unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3333         const char *intrin_name;
3334         LLVMTypeRef type;
3335         LLVMValueRef zero;
3336
3337         switch (src0_bitsize) {
3338         case 64:
3339                 intrin_name = "llvm.cttz.i64";
3340                 type = ctx->i64;
3341                 zero = ctx->i64_0;
3342                 break;
3343         case 32:
3344                 intrin_name = "llvm.cttz.i32";
3345                 type = ctx->i32;
3346                 zero = ctx->i32_0;
3347                 break;
3348         case 16:
3349                 intrin_name = "llvm.cttz.i16";
3350                 type = ctx->i16;
3351                 zero = ctx->i16_0;
3352                 break;
3353         case 8:
3354                 intrin_name = "llvm.cttz.i8";
3355                 type = ctx->i8;
3356                 zero = ctx->i8_0;
3357                 break;
3358         default:
3359                 unreachable(!"invalid bitsize");
3360         }
3361
3362         LLVMValueRef params[2] = {
3363                 src0,
3364
3365                 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
3366                  * add special code to check for x=0. The reason is that
3367                  * the LLVM behavior for x=0 is different from what we
3368                  * need here. However, LLVM also assumes that ffs(x) is
3369                  * in [0, 31], but GLSL expects that ffs(0) = -1, so
3370                  * a conditional assignment to handle 0 is still required.
3371                  *
3372                  * The hardware already implements the correct behavior.
3373                  */
3374                 ctx->i1true,
3375         };
3376
3377         LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
3378                                               params, 2,
3379                                               AC_FUNC_ATTR_READNONE);
3380
3381         if (src0_bitsize == 64) {
3382                 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
3383         } else if (src0_bitsize < 32) {
3384                 lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
3385         }
3386
3387         /* TODO: We need an intrinsic to skip this conditional. */
3388         /* Check for zero: */
3389         return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
3390                                                            LLVMIntEQ, src0,
3391                                                            zero, ""),
3392                                LLVMConstInt(ctx->i32, -1, 0), lsb, "");
3393 }
3394
3395 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
3396 {
3397         return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
3398 }
3399
3400 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
3401 {
3402         return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
3403 }
3404
3405 static struct ac_llvm_flow *
3406 get_current_flow(struct ac_llvm_context *ctx)
3407 {
3408         if (ctx->flow_depth > 0)
3409                 return &ctx->flow[ctx->flow_depth - 1];
3410         return NULL;
3411 }
3412
3413 static struct ac_llvm_flow *
3414 get_innermost_loop(struct ac_llvm_context *ctx)
3415 {
3416         for (unsigned i = ctx->flow_depth; i > 0; --i) {
3417                 if (ctx->flow[i - 1].loop_entry_block)
3418                         return &ctx->flow[i - 1];
3419         }
3420         return NULL;
3421 }
3422
3423 static struct ac_llvm_flow *
3424 push_flow(struct ac_llvm_context *ctx)
3425 {
3426         struct ac_llvm_flow *flow;
3427
3428         if (ctx->flow_depth >= ctx->flow_depth_max) {
3429                 unsigned new_max = MAX2(ctx->flow_depth << 1,
3430                                         AC_LLVM_INITIAL_CF_DEPTH);
3431
3432                 ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow));
3433                 ctx->flow_depth_max = new_max;
3434         }
3435
3436         flow = &ctx->flow[ctx->flow_depth];
3437         ctx->flow_depth++;
3438
3439         flow->next_block = NULL;
3440         flow->loop_entry_block = NULL;
3441         return flow;
3442 }
3443
3444 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
3445                                 int label_id)
3446 {
3447         char buf[32];
3448         snprintf(buf, sizeof(buf), "%s%d", base, label_id);
3449         LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
3450 }
3451
3452 /* Append a basic block at the level of the parent flow.
3453  */
3454 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
3455                                             const char *name)
3456 {
3457         assert(ctx->flow_depth >= 1);
3458
3459         if (ctx->flow_depth >= 2) {
3460                 struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2];
3461
3462                 return LLVMInsertBasicBlockInContext(ctx->context,
3463                                                      flow->next_block, name);
3464         }
3465
3466         LLVMValueRef main_fn =
3467                 LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3468         return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3469 }
3470
3471 /* Emit a branch to the given default target for the current block if
3472  * applicable -- that is, if the current block does not already contain a
3473  * branch from a break or continue.
3474  */
3475 static void emit_default_branch(LLVMBuilderRef builder,
3476                                 LLVMBasicBlockRef target)
3477 {
3478         if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3479                  LLVMBuildBr(builder, target);
3480 }
3481
3482 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3483 {
3484         struct ac_llvm_flow *flow = push_flow(ctx);
3485         flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3486         flow->next_block = append_basic_block(ctx, "ENDLOOP");
3487         set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3488         LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3489         LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3490 }
3491
3492 void ac_build_break(struct ac_llvm_context *ctx)
3493 {
3494         struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3495         LLVMBuildBr(ctx->builder, flow->next_block);
3496 }
3497
3498 void ac_build_continue(struct ac_llvm_context *ctx)
3499 {
3500         struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3501         LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3502 }
3503
3504 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3505 {
3506         struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3507         LLVMBasicBlockRef endif_block;
3508
3509         assert(!current_branch->loop_entry_block);
3510
3511         endif_block = append_basic_block(ctx, "ENDIF");
3512         emit_default_branch(ctx->builder, endif_block);
3513
3514         LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3515         set_basicblock_name(current_branch->next_block, "else", label_id);
3516
3517         current_branch->next_block = endif_block;
3518 }
3519
3520 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3521 {
3522         struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3523
3524         assert(!current_branch->loop_entry_block);
3525
3526         emit_default_branch(ctx->builder, current_branch->next_block);
3527         LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3528         set_basicblock_name(current_branch->next_block, "endif", label_id);
3529
3530         ctx->flow_depth--;
3531 }
3532
3533 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3534 {
3535         struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3536
3537         assert(current_loop->loop_entry_block);
3538
3539         emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3540
3541         LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3542         set_basicblock_name(current_loop->next_block, "endloop", label_id);
3543         ctx->flow_depth--;
3544 }
3545
3546 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3547 {
3548         struct ac_llvm_flow *flow = push_flow(ctx);
3549         LLVMBasicBlockRef if_block;
3550
3551         if_block = append_basic_block(ctx, "IF");
3552         flow->next_block = append_basic_block(ctx, "ELSE");
3553         set_basicblock_name(if_block, "if", label_id);
3554         LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3555         LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3556 }
3557
3558 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
3559                  int label_id)
3560 {
3561         LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
3562                                           value, ctx->f32_0, "");
3563         ac_build_ifcc(ctx, cond, label_id);
3564 }
3565
3566 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
3567                   int label_id)
3568 {
3569         LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3570                                           ac_to_integer(ctx, value),
3571                                           ctx->i32_0, "");
3572         ac_build_ifcc(ctx, cond, label_id);
3573 }
3574
3575 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
3576                              const char *name)
3577 {
3578         LLVMBuilderRef builder = ac->builder;
3579         LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3580         LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3581         LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3582         LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3583         LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3584         LLVMValueRef res;
3585
3586         if (first_instr) {
3587                 LLVMPositionBuilderBefore(first_builder, first_instr);
3588         } else {
3589                 LLVMPositionBuilderAtEnd(first_builder, first_block);
3590         }
3591
3592         res = LLVMBuildAlloca(first_builder, type, name);
3593         LLVMDisposeBuilder(first_builder);
3594         return res;
3595 }
3596
3597 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
3598                                    LLVMTypeRef type, const char *name)
3599 {
3600         LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3601         LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3602         return ptr;
3603 }
3604
3605 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
3606                          LLVMTypeRef type)
3607 {
3608         int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3609         return LLVMBuildBitCast(ctx->builder, ptr,
3610                                 LLVMPointerType(type, addr_space), "");
3611 }
3612
3613 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
3614                             unsigned count)
3615 {
3616         unsigned num_components = ac_get_llvm_num_components(value);
3617         if (count == num_components)
3618                 return value;
3619
3620         LLVMValueRef masks[MAX2(count, 2)];
3621         masks[0] = ctx->i32_0;
3622         masks[1] = ctx->i32_1;
3623         for (unsigned i = 2; i < count; i++)
3624                 masks[i] = LLVMConstInt(ctx->i32, i, false);
3625
3626         if (count == 1)
3627                 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
3628                                                "");
3629
3630         LLVMValueRef swizzle = LLVMConstVector(masks, count);
3631         return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3632 }
3633
3634 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
3635                              unsigned rshift, unsigned bitwidth)
3636 {
3637         LLVMValueRef value = param;
3638         if (rshift)
3639                 value = LLVMBuildLShr(ctx->builder, value,
3640                                       LLVMConstInt(ctx->i32, rshift, false), "");
3641
3642         if (rshift + bitwidth < 32) {
3643                 unsigned mask = (1 << bitwidth) - 1;
3644                 value = LLVMBuildAnd(ctx->builder, value,
3645                                      LLVMConstInt(ctx->i32, mask, false), "");
3646         }
3647         return value;
3648 }
3649
3650 /* Adjust the sample index according to FMASK.
3651  *
3652  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3653  * which is the identity mapping. Each nibble says which physical sample
3654  * should be fetched to get that sample.
3655  *
3656  * For example, 0x11111100 means there are only 2 samples stored and
3657  * the second sample covers 3/4 of the pixel. When reading samples 0
3658  * and 1, return physical sample 0 (determined by the first two 0s
3659  * in FMASK), otherwise return physical sample 1.
3660  *
3661  * The sample index should be adjusted as follows:
3662  *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3663  */
3664 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
3665                               LLVMValueRef *addr, bool is_array_tex)
3666 {
3667         struct ac_image_args fmask_load = {};
3668         fmask_load.opcode = ac_image_load;
3669         fmask_load.resource = fmask;
3670         fmask_load.dmask = 0xf;
3671         fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3672         fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3673
3674         fmask_load.coords[0] = addr[0];
3675         fmask_load.coords[1] = addr[1];
3676         if (is_array_tex)
3677                 fmask_load.coords[2] = addr[2];
3678
3679         LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3680         fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
3681                                               ac->i32_0, "");
3682
3683         /* Apply the formula. */
3684         unsigned sample_chan = is_array_tex ? 3 : 2;
3685         LLVMValueRef final_sample;
3686         final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3687                                     LLVMConstInt(ac->i32, 4, 0), "");
3688         final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3689         /* Mask the sample index by 0x7, because 0x8 means an unknown value
3690          * with EQAA, so those will map to 0. */
3691         final_sample = LLVMBuildAnd(ac->builder, final_sample,
3692                                     LLVMConstInt(ac->i32, 0x7, 0), "");
3693
3694         /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3695          * resource descriptor is 0 (invalid).
3696          */
3697         LLVMValueRef tmp;
3698         tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3699         tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3700         tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3701
3702         /* Replace the MSAA sample index. */
3703         addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
3704                                             addr[sample_chan], "");
3705 }
3706
3707 static LLVMValueRef
3708 _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3709 {
3710         ac_build_optimization_barrier(ctx, &src);
3711         return ac_build_intrinsic(ctx,
3712                         lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3713                         LLVMTypeOf(src), (LLVMValueRef []) {
3714                         src, lane },
3715                         lane == NULL ? 1 : 2,
3716                         AC_FUNC_ATTR_READNONE |
3717                         AC_FUNC_ATTR_CONVERGENT);
3718 }
3719
3720 /**
3721  * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3722  * @param ctx
3723  * @param src
3724  * @param lane - id of the lane or NULL for the first active lane
3725  * @return value of the lane
3726  */
3727 LLVMValueRef
3728 ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3729 {
3730         LLVMTypeRef src_type = LLVMTypeOf(src);
3731         src = ac_to_integer(ctx, src);
3732         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3733         LLVMValueRef ret;
3734
3735         if (bits == 32) {
3736                 ret = _ac_build_readlane(ctx, src, lane);
3737         } else {
3738                 assert(bits % 32 == 0);
3739                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3740                 LLVMValueRef src_vector =
3741                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3742                 ret = LLVMGetUndef(vec_type);
3743                 for (unsigned i = 0; i < bits / 32; i++) {
3744                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3745                                                 LLVMConstInt(ctx->i32, i, 0), "");
3746                         LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
3747                         ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
3748                                                 LLVMConstInt(ctx->i32, i, 0), "");
3749                 }
3750         }
3751         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3752 }
3753
3754 LLVMValueRef
3755 ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
3756 {
3757         /* TODO: Use the actual instruction when LLVM adds an intrinsic for it.
3758          */
3759         LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
3760                                           ac_get_thread_id(ctx), "");
3761         return LLVMBuildSelect(ctx->builder, pred, value, src, "");
3762 }
3763
3764 LLVMValueRef
3765 ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3766 {
3767         LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
3768                                                  LLVMVectorType(ctx->i32, 2),
3769                                                  "");
3770         LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
3771                                                        ctx->i32_0, "");
3772         LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
3773                                                        ctx->i32_1, "");
3774         LLVMValueRef val =
3775                 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3776                                    (LLVMValueRef []) { mask_lo, ctx->i32_0 },
3777                                    2, AC_FUNC_ATTR_READNONE);
3778         val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
3779                                  (LLVMValueRef []) { mask_hi, val },
3780                                  2, AC_FUNC_ATTR_READNONE);
3781         return val;
3782 }
3783
/* Values passed as the dpp_ctrl operand of "llvm.amdgcn.update.dpp.i32".
 *
 * The entries with a leading underscore are base encodings that get
 * parameters OR-ed into their low bits (see dpp_quad_perm(), dpp_row_sl()
 * and dpp_row_sr()); the remaining entries are used as they are.
 *
 * NOTE(review): the exact meaning of each mode is defined by the hardware
 * ISA, not by this file -- confirm against the ISA documentation.
 */
enum dpp_ctrl {
        /* Base encodings. */
        _dpp_quad_perm      = 0x000,
        _dpp_row_sl         = 0x100,
        _dpp_row_sr         = 0x110,
        _dpp_row_rr         = 0x120,

        /* Complete encodings. */
        dpp_wf_sl1          = 0x130,
        dpp_wf_rl1          = 0x134,
        dpp_wf_sr1          = 0x138,
        dpp_wf_rr1          = 0x13C,
        dpp_row_mirror      = 0x140,
        dpp_row_half_mirror = 0x141,
        dpp_row_bcast15     = 0x142,
        dpp_row_bcast31     = 0x143,
};
3798
3799 static inline enum dpp_ctrl
3800 dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
3801 {
3802         assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3803         return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3804 }
3805
3806 static inline enum dpp_ctrl
3807 dpp_row_sl(unsigned amount)
3808 {
3809         assert(amount > 0 && amount < 16);
3810         return _dpp_row_sl | amount;
3811 }
3812
3813 static inline enum dpp_ctrl
3814 dpp_row_sr(unsigned amount)
3815 {
3816         assert(amount > 0 && amount < 16);
3817         return _dpp_row_sr | amount;
3818 }
3819
3820 static LLVMValueRef
3821 _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3822               enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3823               bool bound_ctrl)
3824 {
3825         return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
3826                                         LLVMTypeOf(old),
3827                                         (LLVMValueRef[]) {
3828                                                 old, src,
3829                                                 LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3830                                                 LLVMConstInt(ctx->i32, row_mask, 0),
3831                                                 LLVMConstInt(ctx->i32, bank_mask, 0),
3832                                                 LLVMConstInt(ctx->i1, bound_ctrl, 0) },
3833                                         6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3834 }
3835
3836 static LLVMValueRef
3837 ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3838              enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3839              bool bound_ctrl)
3840 {
3841         LLVMTypeRef src_type = LLVMTypeOf(src);
3842         src = ac_to_integer(ctx, src);
3843         old = ac_to_integer(ctx, old);
3844         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3845         LLVMValueRef ret;
3846         if (bits == 32) {
3847                 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
3848                                     bank_mask, bound_ctrl);
3849         } else {
3850                 assert(bits % 32 == 0);
3851                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3852                 LLVMValueRef src_vector =
3853                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3854                 LLVMValueRef old_vector =
3855                         LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3856                 ret = LLVMGetUndef(vec_type);
3857                 for (unsigned i = 0; i < bits / 32; i++) {
3858                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3859                                                       LLVMConstInt(ctx->i32, i,
3860                                                                    0), "");
3861                         old = LLVMBuildExtractElement(ctx->builder, old_vector,
3862                                                       LLVMConstInt(ctx->i32, i,
3863                                                                    0), "");
3864                         LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
3865                                                               dpp_ctrl,
3866                                                               row_mask,
3867                                                               bank_mask,
3868                                                               bound_ctrl);
3869                         ret = LLVMBuildInsertElement(ctx->builder, ret,
3870                                                      ret_comp,
3871                                                      LLVMConstInt(ctx->i32, i,
3872                                                                   0), "");
3873                 }
3874         }
3875         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3876 }
3877
/* Pack three 5-bit masks into one pattern word: and_mask in bits 0-4,
 * or_mask in bits 5-9 and xor_mask in bits 10-14. Every mask must be
 * below 32. */
static inline unsigned
ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
        assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

        unsigned pattern = and_mask;
        pattern |= or_mask << 5;
        pattern |= xor_mask << 10;
        return pattern;
}
3884
3885 static LLVMValueRef
3886 _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3887 {
3888         return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
3889                                    LLVMTypeOf(src), (LLVMValueRef []) {
3890                                         src, LLVMConstInt(ctx->i32, mask, 0) },
3891                                    2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3892 }
3893
3894 LLVMValueRef
3895 ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3896 {
3897         LLVMTypeRef src_type = LLVMTypeOf(src);
3898         src = ac_to_integer(ctx, src);
3899         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3900         LLVMValueRef ret;
3901         if (bits == 32) {
3902                 ret = _ac_build_ds_swizzle(ctx, src, mask);
3903         } else {
3904                 assert(bits % 32 == 0);
3905                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3906                 LLVMValueRef src_vector =
3907                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3908                 ret = LLVMGetUndef(vec_type);
3909                 for (unsigned i = 0; i < bits / 32; i++) {
3910                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3911                                                       LLVMConstInt(ctx->i32, i,
3912                                                                    0), "");
3913                         LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
3914                                                                      mask);
3915                         ret = LLVMBuildInsertElement(ctx->builder, ret,
3916                                                      ret_comp,
3917                                                      LLVMConstInt(ctx->i32, i,
3918                                                                   0), "");
3919                 }
3920         }
3921         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3922 }
3923
3924 static LLVMValueRef
3925 ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
3926 {
3927         char name[32], type[8];
3928         ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3929         snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
3930         return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
3931                                   (LLVMValueRef []) { src }, 1,
3932                                   AC_FUNC_ATTR_READNONE);
3933 }
3934
3935 static LLVMValueRef
3936 ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
3937                       LLVMValueRef inactive)
3938 {
3939         char name[33], type[8];
3940         LLVMTypeRef src_type = LLVMTypeOf(src);
3941         src = ac_to_integer(ctx, src);
3942         inactive = ac_to_integer(ctx, inactive);
3943         ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3944         snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
3945         LLVMValueRef ret =
3946                 ac_build_intrinsic(ctx, name,
3947                                         LLVMTypeOf(src), (LLVMValueRef []) {
3948                                         src, inactive }, 2,
3949                                         AC_FUNC_ATTR_READNONE |
3950                                         AC_FUNC_ATTR_CONVERGENT);
3951         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3952 }
3953
3954 static LLVMValueRef
3955 get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
3956 {
3957         if (type_size == 4) {
3958                 switch (op) {
3959                 case nir_op_iadd: return ctx->i32_0;
3960                 case nir_op_fadd: return ctx->f32_0;
3961                 case nir_op_imul: return ctx->i32_1;
3962                 case nir_op_fmul: return ctx->f32_1;
3963                 case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3964                 case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3965                 case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
3966                 case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3967                 case nir_op_umax: return ctx->i32_0;
3968                 case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
3969                 case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
3970                 case nir_op_ior: return ctx->i32_0;
3971                 case nir_op_ixor: return ctx->i32_0;
3972                 default:
3973                         unreachable("bad reduction intrinsic");
3974                 }
3975         } else { /* type_size == 64bit */
3976                 switch (op) {
3977                 case nir_op_iadd: return ctx->i64_0;
3978                 case nir_op_fadd: return ctx->f64_0;
3979                 case nir_op_imul: return ctx->i64_1;
3980                 case nir_op_fmul: return ctx->f64_1;
3981                 case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3982                 case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3983                 case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
3984                 case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3985                 case nir_op_umax: return ctx->i64_0;
3986                 case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
3987                 case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
3988                 case nir_op_ior: return ctx->i64_0;
3989                 case nir_op_ixor: return ctx->i64_0;
3990                 default:
3991                         unreachable("bad reduction intrinsic");
3992                 }
3993         }
3994 }
3995
3996 static LLVMValueRef
3997 ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
3998 {
3999         bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
4000         switch (op) {
4001         case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
4002         case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
4003         case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
4004         case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
4005         case nir_op_imin: return LLVMBuildSelect(ctx->builder,
4006                                         LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
4007                                         lhs, rhs, "");
4008         case nir_op_umin: return LLVMBuildSelect(ctx->builder,
4009                                         LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
4010                                         lhs, rhs, "");
4011         case nir_op_fmin: return ac_build_intrinsic(ctx,
4012                                         _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
4013                                         _64bit ? ctx->f64 : ctx->f32,
4014                                         (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
4015         case nir_op_imax: return LLVMBuildSelect(ctx->builder,
4016                                         LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
4017                                         lhs, rhs, "");
4018         case nir_op_umax: return LLVMBuildSelect(ctx->builder,
4019                                         LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
4020                                         lhs, rhs, "");
4021         case nir_op_fmax: return ac_build_intrinsic(ctx,
4022                                         _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
4023                                         _64bit ? ctx->f64 : ctx->f32,
4024                                         (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
4025         case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
4026         case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
4027         case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
4028         default:
4029                 unreachable("bad reduction intrinsic");
4030         }
4031 }
4032
4033 /**
4034  * \param maxprefix specifies that the result only needs to be correct for a
4035  *     prefix of this many threads
4036  *
4037  * TODO: add inclusive and excluse scan functions for GFX6.
4038  */
4039 static LLVMValueRef
4040 ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
4041               unsigned maxprefix)
4042 {
4043         LLVMValueRef result, tmp;
4044         result = src;
4045         if (maxprefix <= 1)
4046                 return result;
4047         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
4048         result = ac_build_alu_op(ctx, result, tmp, op);
4049         if (maxprefix <= 2)
4050                 return result;
4051         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
4052         result = ac_build_alu_op(ctx, result, tmp, op);
4053         if (maxprefix <= 3)
4054                 return result;
4055         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
4056         result = ac_build_alu_op(ctx, result, tmp, op);
4057         if (maxprefix <= 4)
4058                 return result;
4059         tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
4060         result = ac_build_alu_op(ctx, result, tmp, op);
4061         if (maxprefix <= 8)
4062                 return result;
4063         tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
4064         result = ac_build_alu_op(ctx, result, tmp, op);
4065         if (maxprefix <= 16)
4066                 return result;
4067         tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4068         result = ac_build_alu_op(ctx, result, tmp, op);
4069         if (maxprefix <= 32)
4070                 return result;
4071         tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4072         result = ac_build_alu_op(ctx, result, tmp, op);
4073         return result;
4074 }
4075
4076 LLVMValueRef
4077 ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4078 {
4079         LLVMValueRef result;
4080
4081         if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4082                 LLVMBuilderRef builder = ctx->builder;
4083                 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4084                 result = ac_build_ballot(ctx, src);
4085                 result = ac_build_mbcnt(ctx, result);
4086                 result = LLVMBuildAdd(builder, result, src, "");
4087                 return result;
4088         }
4089
4090         ac_build_optimization_barrier(ctx, &src);
4091
4092         LLVMValueRef identity =
4093                 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4094         result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4095                                   LLVMTypeOf(identity), "");
4096         result = ac_build_scan(ctx, op, result, identity, 64);
4097
4098         return ac_build_wwm(ctx, result);
4099 }
4100
4101 LLVMValueRef
4102 ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4103 {
4104         LLVMValueRef result;
4105
4106         if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4107                 LLVMBuilderRef builder = ctx->builder;
4108                 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4109                 result = ac_build_ballot(ctx, src);
4110                 result = ac_build_mbcnt(ctx, result);
4111                 return result;
4112         }
4113
4114         ac_build_optimization_barrier(ctx, &src);
4115
4116         LLVMValueRef identity =
4117                 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4118         result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4119                                   LLVMTypeOf(identity), "");
4120         result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
4121         result = ac_build_scan(ctx, op, result, identity, 64);
4122
4123         return ac_build_wwm(ctx, result);
4124 }
4125
4126 LLVMValueRef
4127 ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
4128 {
4129         if (cluster_size == 1) return src;
4130         ac_build_optimization_barrier(ctx, &src);
4131         LLVMValueRef result, swap;
4132         LLVMValueRef identity = get_reduction_identity(ctx, op,
4133                                                                 ac_get_type_size(LLVMTypeOf(src)));
4134         result = LLVMBuildBitCast(ctx->builder,
4135                                                                 ac_build_set_inactive(ctx, src, identity),
4136                                                                 LLVMTypeOf(identity), "");
4137         swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
4138         result = ac_build_alu_op(ctx, result, swap, op);
4139         if (cluster_size == 2) return ac_build_wwm(ctx, result);
4140
4141         swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
4142         result = ac_build_alu_op(ctx, result, swap, op);
4143         if (cluster_size == 4) return ac_build_wwm(ctx, result);
4144
4145         if (ctx->chip_class >= GFX8)
4146                 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
4147         else
4148                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
4149         result = ac_build_alu_op(ctx, result, swap, op);
4150         if (cluster_size == 8) return ac_build_wwm(ctx, result);
4151
4152         if (ctx->chip_class >= GFX8)
4153                 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
4154         else
4155                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
4156         result = ac_build_alu_op(ctx, result, swap, op);
4157         if (cluster_size == 16) return ac_build_wwm(ctx, result);
4158
4159         if (ctx->chip_class >= GFX8 && cluster_size != 32)
4160                 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4161         else
4162                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
4163         result = ac_build_alu_op(ctx, result, swap, op);
4164         if (cluster_size == 32) return ac_build_wwm(ctx, result);
4165
4166         if (ctx->chip_class >= GFX8) {
4167                 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4168                 result = ac_build_alu_op(ctx, result, swap, op);
4169                 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
4170                 return ac_build_wwm(ctx, result);
4171         } else {
4172                 swap = ac_build_readlane(ctx, result, ctx->i32_0);
4173                 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
4174                 result = ac_build_alu_op(ctx, result, swap, op);
4175                 return ac_build_wwm(ctx, result);
4176         }
4177 }
4178
4179 /**
4180  * "Top half" of a scan that reduces per-wave values across an entire
4181  * workgroup.
4182  *
4183  * The source value must be present in the highest lane of the wave, and the
4184  * highest lane must be live.
4185  */
4186 void
4187 ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4188 {
4189         if (ws->maxwaves <= 1)
4190                 return;
4191
4192         const LLVMValueRef i32_63 = LLVMConstInt(ctx->i32, 63, false);
4193         LLVMBuilderRef builder = ctx->builder;
4194         LLVMValueRef tid = ac_get_thread_id(ctx);
4195         LLVMValueRef tmp;
4196
4197         tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, i32_63, "");
4198         ac_build_ifcc(ctx, tmp, 1000);
4199         LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4200         ac_build_endif(ctx, 1000);
4201 }
4202
4203 /**
4204  * "Bottom half" of a scan that reduces per-wave values across an entire
4205  * workgroup.
4206  *
4207  * The caller must place a barrier between the top and bottom halves.
4208  */
4209 void
4210 ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4211 {
4212         const LLVMTypeRef type = LLVMTypeOf(ws->src);
4213         const LLVMValueRef identity =
4214                 get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
4215
4216         if (ws->maxwaves <= 1) {
4217                 ws->result_reduce = ws->src;
4218                 ws->result_inclusive = ws->src;
4219                 ws->result_exclusive = identity;
4220                 return;
4221         }
4222         assert(ws->maxwaves <= 32);
4223
4224         LLVMBuilderRef builder = ctx->builder;
4225         LLVMValueRef tid = ac_get_thread_id(ctx);
4226         LLVMBasicBlockRef bbs[2];
4227         LLVMValueRef phivalues_scan[2];
4228         LLVMValueRef tmp, tmp2;
4229
4230         bbs[0] = LLVMGetInsertBlock(builder);
4231         phivalues_scan[0] = LLVMGetUndef(type);
4232
4233         if (ws->enable_reduce)
4234                 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4235         else if (ws->enable_inclusive)
4236                 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4237         else
4238                 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4239         ac_build_ifcc(ctx, tmp, 1001);
4240         {
4241                 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
4242
4243                 ac_build_optimization_barrier(ctx, &tmp);
4244
4245                 bbs[1] = LLVMGetInsertBlock(builder);
4246                 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves);
4247         }
4248         ac_build_endif(ctx, 1001);
4249
4250         const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4251
4252         if (ws->enable_reduce) {
4253                 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4254                 ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4255         }
4256         if (ws->enable_inclusive)
4257                 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4258         if (ws->enable_exclusive) {
4259                 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4260                 tmp = ac_build_readlane(ctx, scan, tmp);
4261                 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4262                 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4263         }
4264 }
4265
/**
 * Scan of a per-wave value across an entire workgroup.
 *
 * Emits the top half, a barrier (ac_build_s_barrier), and then the bottom
 * half, in that order.
 *
 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all
 * threads of the workgroup are live. (This requirement cannot easily be
 * relaxed in a useful manner because of the barrier in the algorithm.)
 *
 * \param ctx  LLVM build context.
 * \param ws   Scan state, passed unchanged to both halves.
 */
void
ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
        /* Publish per-wave values ... */
        ac_build_wg_wavescan_top(ctx, ws);
        /* ... the barrier separates the two halves ... */
        ac_build_s_barrier(ctx);
        /* ... then combine them. */
        ac_build_wg_wavescan_bottom(ctx, ws);
}
4282
4283 /**
4284  * "Top half" of a scan that reduces per-thread values across an entire
4285  * workgroup.
4286  *
4287  * All lanes must be active when this code runs.
4288  */
4289 void
4290 ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4291 {
4292         if (ws->enable_exclusive) {
4293                 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4294                 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4295                         ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4296                 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4297         } else {
4298                 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4299         }
4300
4301         bool enable_inclusive = ws->enable_inclusive;
4302         bool enable_exclusive = ws->enable_exclusive;
4303         ws->enable_inclusive = false;
4304         ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4305         ac_build_wg_wavescan_top(ctx, ws);
4306         ws->enable_inclusive = enable_inclusive;
4307         ws->enable_exclusive = enable_exclusive;
4308 }
4309
4310 /**
4311  * "Bottom half" of a scan that reduces per-thread values across an entire
4312  * workgroup.
4313  *
4314  * The caller must place a barrier between the top and bottom halves.
4315  */
4316 void
4317 ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4318 {
4319         bool enable_inclusive = ws->enable_inclusive;
4320         bool enable_exclusive = ws->enable_exclusive;
4321         ws->enable_inclusive = false;
4322         ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4323         ac_build_wg_wavescan_bottom(ctx, ws);
4324         ws->enable_inclusive = enable_inclusive;
4325         ws->enable_exclusive = enable_exclusive;
4326
4327         /* ws->result_reduce is already the correct value */
4328         if (ws->enable_inclusive)
4329                 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
4330         if (ws->enable_exclusive)
4331                 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4332 }
4333
/**
 * Scan of per-thread values across an entire workgroup.
 *
 * Emits the top half, a barrier (ac_build_s_barrier), and then the bottom
 * half, in that order.
 *
 * Because of that barrier the caller must ensure that all lanes are
 * active when this code runs (WWM is insufficient!).
 *
 * \param ctx  LLVM build context.
 * \param ws   Scan state, passed unchanged to both halves.
 */
void
ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
        /* In-wave scan and publication of per-wave values. */
        ac_build_wg_scan_top(ctx, ws);
        /* The barrier separates the two halves. */
        ac_build_s_barrier(ctx);
        /* Cross-wave combine. */
        ac_build_wg_scan_bottom(ctx, ws);
}
4347
4348 LLVMValueRef
4349 ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
4350                 unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
4351 {
4352         unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4353         if (ctx->chip_class >= GFX8) {
4354                 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4355         } else {
4356                 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4357         }
4358 }
4359
4360 LLVMValueRef
4361 ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4362 {
4363         index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4364         return ac_build_intrinsic(ctx,
4365                   "llvm.amdgcn.ds.bpermute", ctx->i32,
4366                   (LLVMValueRef []) {index, src}, 2,
4367                   AC_FUNC_ATTR_READNONE |
4368                   AC_FUNC_ATTR_CONVERGENT);
4369 }
4370
4371 LLVMValueRef
4372 ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0,
4373                    unsigned bitsize)
4374 {
4375         LLVMTypeRef type;
4376         char *intr;
4377
4378         if (bitsize == 16) {
4379                 intr = "llvm.amdgcn.frexp.exp.i16.f16";
4380                 type = ctx->i16;
4381         } else if (bitsize == 32) {
4382                 intr = "llvm.amdgcn.frexp.exp.i32.f32";
4383                 type = ctx->i32;
4384         } else {
4385                 intr = "llvm.amdgcn.frexp.exp.i32.f64";
4386                 type = ctx->i32;
4387         }
4388
4389         LLVMValueRef params[] = {
4390                 src0,
4391         };
4392         return ac_build_intrinsic(ctx, intr, type, params, 1,
4393                                   AC_FUNC_ATTR_READNONE);
4394 }
4395 LLVMValueRef
4396 ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0,
4397                     unsigned bitsize)
4398 {
4399         LLVMTypeRef type;
4400         char *intr;
4401
4402         if (bitsize == 16) {
4403                 intr = "llvm.amdgcn.frexp.mant.f16";
4404                 type = ctx->f16;
4405         } else if (bitsize == 32) {
4406                 intr = "llvm.amdgcn.frexp.mant.f32";
4407                 type = ctx->f32;
4408         } else {
4409                 intr = "llvm.amdgcn.frexp.mant.f64";
4410                 type = ctx->f64;
4411         }
4412
4413         LLVMValueRef params[] = {
4414                 src0,
4415         };
4416         return ac_build_intrinsic(ctx, intr, type, params, 1,
4417                                   AC_FUNC_ATTR_READNONE);
4418 }
4419
4420 /*
4421  * this takes an I,J coordinate pair,
4422  * and works out the X and Y derivatives.
4423  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4424  */
4425 LLVMValueRef
4426 ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4427 {
4428         LLVMValueRef result[4], a;
4429         unsigned i;
4430
4431         for (i = 0; i < 2; i++) {
4432                 a = LLVMBuildExtractElement(ctx->builder, interp_ij,
4433                                             LLVMConstInt(ctx->i32, i, false), "");
4434                 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4435                 result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4436         }
4437         return ac_build_gather_values(ctx, result, 4);
4438 }
4439
4440 LLVMValueRef
4441 ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4442 {
4443         LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live",
4444                                                  ctx->i1, NULL, 0,
4445                                                  AC_FUNC_ATTR_READNONE);
4446         result = LLVMBuildNot(ctx->builder, result, "");
4447         return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
4448 }