src/amd/common/ac_llvm_build.c

   1 /*
   2  * Copyright 2014 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sub license, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  19  *
  20  * The above copyright notice and this permission notice (including the
  21  * next paragraph) shall be included in all copies or substantial portions
  22  * of the Software.
  23  *
  24  */
  25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
  26 #include "ac_llvm_build.h"
  27
  28 #include <llvm-c/Core.h>
  29
  30 #include "c11/threads.h"
  31
  32 #include <assert.h>
  33 #include <stdio.h>
  34
  35 #include "ac_llvm_util.h"
  36 #include "ac_exp_param.h"
  37 #include "util/bitscan.h"
  38 #include "util/macros.h"
  39 #include "util/u_atomic.h"
  40 #include "util/u_math.h"
  41 #include "sid.h"
  42
  43 #include "shader_enums.h"
  44
  45 #define AC_LLVM_INITIAL_CF_DEPTH 4
  46
/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
        /* Loop exit or next part of if/else/endif. */
        LLVMBasicBlockRef next_block;
        /* NOTE(review): presumably the first block of a loop body and unused
         * for if/else/endif -- the users of this field are outside this
         * section, confirm there.
         */
        LLVMBasicBlockRef loop_entry_block;
};
  54
  55 /* Initialize module-independent parts of the context.
  56  *
  57  * The caller is responsible for initializing ctx::module and ctx::builder.
  58  */
  59 void
  60 ac_llvm_context_init(struct ac_llvm_context *ctx,
  61                      struct ac_llvm_compiler *compiler,
  62                      enum chip_class chip_class, enum radeon_family family,
  63                      enum ac_float_mode float_mode, unsigned wave_size)
  64 {
  65         LLVMValueRef args[1];
  66
  67         ctx->context = LLVMContextCreate();
  68
  69         ctx->chip_class = chip_class;
  70         ctx->family = family;
  71         ctx->wave_size = wave_size;
  72         ctx->module = ac_create_module(wave_size == 32 ? compiler->tm_wave32
  73                                                        : compiler->tm,
  74                                        ctx->context);
  75         ctx->builder = ac_create_builder(ctx->context, float_mode);
  76
  77         ctx->voidt = LLVMVoidTypeInContext(ctx->context);
  78         ctx->i1 = LLVMInt1TypeInContext(ctx->context);
  79         ctx->i8 = LLVMInt8TypeInContext(ctx->context);
  80         ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
  81         ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
  82         ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
  83         ctx->intptr = ctx->i32;
  84         ctx->f16 = LLVMHalfTypeInContext(ctx->context);
  85         ctx->f32 = LLVMFloatTypeInContext(ctx->context);
  86         ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
  87         ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
  88         ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
  89         ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
  90         ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
  91         ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
  92         ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
  93         ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
  94         ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
  95         ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
  96
  97         ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
  98         ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
  99         ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
 100         ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
 101         ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
 102         ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
 103         ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
 104         ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
 105         ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
 106         ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
 107         ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
 108         ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
 109         ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
 110         ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
 111
 112         ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
 113         ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
 114
 115         ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 116                                                      "range", 5);
 117
 118         ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 119                                                                "invariant.load", 14);
 120
 121         ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
 122
 123         args[0] = LLVMConstReal(ctx->f32, 2.5);
 124         ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
 125
 126         ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 127                                                         "amdgpu.uniform", 14);
 128
 129         ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
 130         ctx->flow = calloc(1, sizeof(*ctx->flow));
 131 }
 132
 133 void
 134 ac_llvm_context_dispose(struct ac_llvm_context *ctx)
 135 {
 136         free(ctx->flow->stack);
 137         free(ctx->flow);
 138         ctx->flow = NULL;
 139 }
 140
 141 int
 142 ac_get_llvm_num_components(LLVMValueRef value)
 143 {
 144         LLVMTypeRef type = LLVMTypeOf(value);
 145         unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
 146                                       ? LLVMGetVectorSize(type)
 147                                       : 1;
 148         return num_components;
 149 }
 150
 151 LLVMValueRef
 152 ac_llvm_extract_elem(struct ac_llvm_context *ac,
 153                      LLVMValueRef value,
 154                      int index)
 155 {
 156         if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
 157                 assert(index == 0);
 158                 return value;
 159         }
 160
 161         return LLVMBuildExtractElement(ac->builder, value,
 162                                        LLVMConstInt(ac->i32, index, false), "");
 163 }
 164
 165 int
 166 ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
 167 {
 168         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
 169                 type = LLVMGetElementType(type);
 170
 171         if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
 172                 return LLVMGetIntTypeWidth(type);
 173
 174         if (type == ctx->f16)
 175                 return 16;
 176         if (type == ctx->f32)
 177                 return 32;
 178         if (type == ctx->f64)
 179                 return 64;
 180
 181         unreachable("Unhandled type kind in get_elem_bits");
 182 }
 183
 184 unsigned
 185 ac_get_type_size(LLVMTypeRef type)
 186 {
 187         LLVMTypeKind kind = LLVMGetTypeKind(type);
 188
 189         switch (kind) {
 190         case LLVMIntegerTypeKind:
 191                 return LLVMGetIntTypeWidth(type) / 8;
 192         case LLVMHalfTypeKind:
 193                 return 2;
 194         case LLVMFloatTypeKind:
 195                 return 4;
 196         case LLVMDoubleTypeKind:
 197                 return 8;
 198         case LLVMPointerTypeKind:
 199                 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
 200                         return 4;
 201                 return 8;
 202         case LLVMVectorTypeKind:
 203                 return LLVMGetVectorSize(type) *
 204                        ac_get_type_size(LLVMGetElementType(type));
 205         case LLVMArrayTypeKind:
 206                 return LLVMGetArrayLength(type) *
 207                        ac_get_type_size(LLVMGetElementType(type));
 208         default:
 209                 assert(0);
 210                 return 0;
 211         }
 212 }
 213
 214 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 215 {
 216         if (t == ctx->i8)
 217                 return ctx->i8;
 218         else if (t == ctx->f16 || t == ctx->i16)
 219                 return ctx->i16;
 220         else if (t == ctx->f32 || t == ctx->i32)
 221                 return ctx->i32;
 222         else if (t == ctx->f64 || t == ctx->i64)
 223                 return ctx->i64;
 224         else
 225                 unreachable("Unhandled integer size");
 226 }
 227
 228 LLVMTypeRef
 229 ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 230 {
 231         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 232                 LLVMTypeRef elem_type = LLVMGetElementType(t);
 233                 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
 234                                       LLVMGetVectorSize(t));
 235         }
 236         if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
 237                 switch (LLVMGetPointerAddressSpace(t)) {
 238                 case AC_ADDR_SPACE_GLOBAL:
 239                         return ctx->i64;
 240                 case AC_ADDR_SPACE_LDS:
 241                         return ctx->i32;
 242                 default:
 243                         unreachable("unhandled address space");
 244                 }
 245         }
 246         return to_integer_type_scalar(ctx, t);
 247 }
 248
 249 LLVMValueRef
 250 ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
 251 {
 252         LLVMTypeRef type = LLVMTypeOf(v);
 253         if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
 254                 return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 255         }
 256         return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 257 }
 258
 259 LLVMValueRef
 260 ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
 261 {
 262         LLVMTypeRef type = LLVMTypeOf(v);
 263         if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
 264                 return v;
 265         return ac_to_integer(ctx, v);
 266 }
 267
 268 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 269 {
 270         if (t == ctx->i8)
 271                 return ctx->i8;
 272         else if (t == ctx->i16 || t == ctx->f16)
 273                 return ctx->f16;
 274         else if (t == ctx->i32 || t == ctx->f32)
 275                 return ctx->f32;
 276         else if (t == ctx->i64 || t == ctx->f64)
 277                 return ctx->f64;
 278         else
 279                 unreachable("Unhandled float size");
 280 }
 281
 282 LLVMTypeRef
 283 ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 284 {
 285         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 286                 LLVMTypeRef elem_type = LLVMGetElementType(t);
 287                 return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
 288                                       LLVMGetVectorSize(t));
 289         }
 290         return to_float_type_scalar(ctx, t);
 291 }
 292
 293 LLVMValueRef
 294 ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
 295 {
 296         LLVMTypeRef type = LLVMTypeOf(v);
 297         return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
 298 }
 299
 300
 301 LLVMValueRef
 302 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
 303                    LLVMTypeRef return_type, LLVMValueRef *params,
 304                    unsigned param_count, unsigned attrib_mask)
 305 {
 306         LLVMValueRef function, call;
 307         bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
 308
 309         function = LLVMGetNamedFunction(ctx->module, name);
 310         if (!function) {
 311                 LLVMTypeRef param_types[32], function_type;
 312                 unsigned i;
 313
 314                 assert(param_count <= 32);
 315
 316                 for (i = 0; i < param_count; ++i) {
 317                         assert(params[i]);
 318                         param_types[i] = LLVMTypeOf(params[i]);
 319                 }
 320                 function_type =
 321                     LLVMFunctionType(return_type, param_types, param_count, 0);
 322                 function = LLVMAddFunction(ctx->module, name, function_type);
 323
 324                 LLVMSetFunctionCallConv(function, LLVMCCallConv);
 325                 LLVMSetLinkage(function, LLVMExternalLinkage);
 326
 327                 if (!set_callsite_attrs)
 328                         ac_add_func_attributes(ctx->context, function, attrib_mask);
 329         }
 330
 331         call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
 332         if (set_callsite_attrs)
 333                 ac_add_func_attributes(ctx->context, call, attrib_mask);
 334         return call;
 335 }
 336
 337 /**
 338  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 339  * intrinsic names).
 340  */
 341 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
 342 {
 343         LLVMTypeRef elem_type = type;
 344
 345         assert(bufsize >= 8);
 346
 347         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
 348                 int ret = snprintf(buf, bufsize, "v%u",
 349                                         LLVMGetVectorSize(type));
 350                 if (ret < 0) {
 351                         char *type_name = LLVMPrintTypeToString(type);
 352                         fprintf(stderr, "Error building type name for: %s\n",
 353                                 type_name);
 354                         return;
 355                 }
 356                 elem_type = LLVMGetElementType(type);
 357                 buf += ret;
 358                 bufsize -= ret;
 359         }
 360         switch (LLVMGetTypeKind(elem_type)) {
 361         default: break;
 362         case LLVMIntegerTypeKind:
 363                 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
 364                 break;
 365         case LLVMHalfTypeKind:
 366                 snprintf(buf, bufsize, "f16");
 367                 break;
 368         case LLVMFloatTypeKind:
 369                 snprintf(buf, bufsize, "f32");
 370                 break;
 371         case LLVMDoubleTypeKind:
 372                 snprintf(buf, bufsize, "f64");
 373                 break;
 374         }
 375 }
 376
 377 /**
 378  * Helper function that builds an LLVM IR PHI node and immediately adds
 379  * incoming edges.
 380  */
 381 LLVMValueRef
 382 ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
 383              unsigned count_incoming, LLVMValueRef *values,
 384              LLVMBasicBlockRef *blocks)
 385 {
 386         LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
 387         LLVMAddIncoming(phi, values, blocks, count_incoming);
 388         return phi;
 389 }
 390
 391 void ac_build_s_barrier(struct ac_llvm_context *ctx)
 392 {
 393         ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
 394                            0, AC_FUNC_ATTR_CONVERGENT);
 395 }
 396
 397 /* Prevent optimizations (at least of memory accesses) across the current
 398  * point in the program by emitting empty inline assembly that is marked as
 399  * having side effects.
 400  *
 401  * Optionally, a value can be passed through the inline assembly to prevent
 402  * LLVM from hoisting calls to ReadNone functions.
 403  */
 404 void
 405 ac_build_optimization_barrier(struct ac_llvm_context *ctx,
 406                               LLVMValueRef *pvgpr)
 407 {
 408         static int counter = 0;
 409
 410         LLVMBuilderRef builder = ctx->builder;
 411         char code[16];
 412
 413         snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
 414
 415         if (!pvgpr) {
 416                 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
 417                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
 418                 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
 419         } else {
 420                 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
 421                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
 422                 LLVMValueRef vgpr = *pvgpr;
 423                 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
 424                 unsigned vgpr_size = ac_get_type_size(vgpr_type);
 425                 LLVMValueRef vgpr0;
 426
 427                 assert(vgpr_size % 4 == 0);
 428
 429                 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
 430                 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
 431                 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
 432                 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
 433                 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
 434
 435                 *pvgpr = vgpr;
 436         }
 437 }
 438
 439 LLVMValueRef
 440 ac_build_shader_clock(struct ac_llvm_context *ctx)
 441 {
 442         const char *intr = HAVE_LLVM >= 0x0900 && ctx->chip_class >= GFX8 ?
 443                                 "llvm.amdgcn.s.memrealtime" : "llvm.readcyclecounter";
 444         LLVMValueRef tmp = ac_build_intrinsic(ctx, intr, ctx->i64, NULL, 0, 0);
 445         return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
 446 }
 447
 448 LLVMValueRef
 449 ac_build_ballot(struct ac_llvm_context *ctx,
 450                 LLVMValueRef value)
 451 {
 452         const char *name;
 453
 454         if (HAVE_LLVM >= 0x900) {
 455                 if (ctx->wave_size == 64)
 456                         name = "llvm.amdgcn.icmp.i64.i32";
 457                 else
 458                         name = "llvm.amdgcn.icmp.i32.i32";
 459         } else {
 460                 name = "llvm.amdgcn.icmp.i32";
 461         }
 462         LLVMValueRef args[3] = {
 463                 value,
 464                 ctx->i32_0,
 465                 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
 466         };
 467
 468         /* We currently have no other way to prevent LLVM from lifting the icmp
 469          * calls to a dominating basic block.
 470          */
 471         ac_build_optimization_barrier(ctx, &args[0]);
 472
 473         args[0] = ac_to_integer(ctx, args[0]);
 474
 475         return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3,
 476                                   AC_FUNC_ATTR_NOUNWIND |
 477                                   AC_FUNC_ATTR_READNONE |
 478                                   AC_FUNC_ATTR_CONVERGENT);
 479 }
 480
 481 LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
 482                                  LLVMValueRef value)
 483 {
 484         const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i1";
 485         LLVMValueRef args[3] = {
 486                 value,
 487                 ctx->i1false,
 488                 LLVMConstInt(ctx->i32, LLVMIntNE, 0),
 489         };
 490
 491         assert(HAVE_LLVM >= 0x0800);
 492         return ac_build_intrinsic(ctx, name, ctx->i64, args, 3,
 493                                   AC_FUNC_ATTR_NOUNWIND |
 494                                   AC_FUNC_ATTR_READNONE |
 495                                   AC_FUNC_ATTR_CONVERGENT);
 496 }
 497
 498 LLVMValueRef
 499 ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
 500 {
 501         LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
 502         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 503         return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
 504 }
 505
 506 LLVMValueRef
 507 ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
 508 {
 509         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 510         return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
 511                              LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
 512 }
 513
 514 LLVMValueRef
 515 ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
 516 {
 517         LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
 518         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 519
 520         LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 521                                          vote_set, active_set, "");
 522         LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 523                                           vote_set,
 524                                           LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
 525         return LLVMBuildOr(ctx->builder, all, none, "");
 526 }
 527
 528 LLVMValueRef
 529 ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
 530                                unsigned value_count, unsigned component)
 531 {
 532         LLVMValueRef vec = NULL;
 533
 534         if (value_count == 1) {
 535                 return values[component];
 536         } else if (!value_count)
 537                 unreachable("value_count is 0");
 538
 539         for (unsigned i = component; i < value_count + component; i++) {
 540                 LLVMValueRef value = values[i];
 541
 542                 if (i == component)
 543                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 544                 LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
 545                 vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
 546         }
 547         return vec;
 548 }
 549
 550 LLVMValueRef
 551 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
 552                                 LLVMValueRef *values,
 553                                 unsigned value_count,
 554                                 unsigned value_stride,
 555                                 bool load,
 556                                 bool always_vector)
 557 {
 558         LLVMBuilderRef builder = ctx->builder;
 559         LLVMValueRef vec = NULL;
 560         unsigned i;
 561
 562         if (value_count == 1 && !always_vector) {
 563                 if (load)
 564                         return LLVMBuildLoad(builder, values[0], "");
 565                 return values[0];
 566         } else if (!value_count)
 567                 unreachable("value_count is 0");
 568
 569         for (i = 0; i < value_count; i++) {
 570                 LLVMValueRef value = values[i * value_stride];
 571                 if (load)
 572                         value = LLVMBuildLoad(builder, value, "");
 573
 574                 if (!i)
 575                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 576                 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
 577                 vec = LLVMBuildInsertElement(builder, vec, value, index, "");
 578         }
 579         return vec;
 580 }
 581
 582 LLVMValueRef
 583 ac_build_gather_values(struct ac_llvm_context *ctx,
 584                        LLVMValueRef *values,
 585                        unsigned value_count)
 586 {
 587         return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
 588 }
 589
 590 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 591  * channels with undef. Extract at most src_channels components from the input.
 592  */
 593 static LLVMValueRef
 594 ac_build_expand(struct ac_llvm_context *ctx,
 595                 LLVMValueRef value,
 596                 unsigned src_channels,
 597                 unsigned dst_channels)
 598 {
 599         LLVMTypeRef elemtype;
 600         LLVMValueRef chan[dst_channels];
 601
 602         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
 603                 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
 604
 605                 if (src_channels == dst_channels && vec_size == dst_channels)
 606                         return value;
 607
 608                 src_channels = MIN2(src_channels, vec_size);
 609
 610                 for (unsigned i = 0; i < src_channels; i++)
 611                         chan[i] = ac_llvm_extract_elem(ctx, value, i);
 612
 613                 elemtype = LLVMGetElementType(LLVMTypeOf(value));
 614         } else {
 615                 if (src_channels) {
 616                         assert(src_channels == 1);
 617                         chan[0] = value;
 618                 }
 619                 elemtype = LLVMTypeOf(value);
 620         }
 621
 622         for (unsigned i = src_channels; i < dst_channels; i++)
 623                 chan[i] = LLVMGetUndef(elemtype);
 624
 625         return ac_build_gather_values(ctx, chan, dst_channels);
 626 }
 627
 628 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
 629  * with undef. Extract at most num_channels components from the input.
 630  */
 631 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
 632                                      LLVMValueRef value,
 633                                      unsigned num_channels)
 634 {
 635         return ac_build_expand(ctx, value, num_channels, 4);
 636 }
 637
 638 LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
 639 {
 640         unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
 641         const char *name;
 642
 643         if (type_size == 2)
 644                 name = "llvm.rint.f16";
 645         else if (type_size == 4)
 646                 name = "llvm.rint.f32";
 647         else
 648                 name = "llvm.rint.f64";
 649
 650         return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
 651                                   AC_FUNC_ATTR_READNONE);
 652 }
 653
 654 LLVMValueRef
 655 ac_build_fdiv(struct ac_llvm_context *ctx,
 656               LLVMValueRef num,
 657               LLVMValueRef den)
 658 {
 659         /* If we do (num / den), LLVM >= 7.0 does:
 660          *    return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
 661          *
 662          * If we do (num * (1 / den)), LLVM does:
 663          *    return num * v_rcp_f32(den);
 664          */
 665         LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
 666         LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
 667         LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
 668
 669         /* Use v_rcp_f32 instead of precise division. */
 670         if (!LLVMIsConstant(ret))
 671                 LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
 672         return ret;
 673 }
 674
 675 /* See fast_idiv_by_const.h. */
 676 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
 677 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
 678                                 LLVMValueRef num,
 679                                 LLVMValueRef multiplier,
 680                                 LLVMValueRef pre_shift,
 681                                 LLVMValueRef post_shift,
 682                                 LLVMValueRef increment)
 683 {
 684         LLVMBuilderRef builder = ctx->builder;
 685
 686         num = LLVMBuildLShr(builder, num, pre_shift, "");
 687         num = LLVMBuildMul(builder,
 688                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 689                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 690         num = LLVMBuildAdd(builder, num,
 691                            LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
 692         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 693         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 694         return LLVMBuildLShr(builder, num, post_shift, "");
 695 }
 696
 697 /* See fast_idiv_by_const.h. */
 698 /* If num != UINT_MAX, this more efficient version can be used. */
 699 /* Set: increment = util_fast_udiv_info::increment; */
 700 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
 701                                     LLVMValueRef num,
 702                                     LLVMValueRef multiplier,
 703                                     LLVMValueRef pre_shift,
 704                                     LLVMValueRef post_shift,
 705                                     LLVMValueRef increment)
 706 {
 707         LLVMBuilderRef builder = ctx->builder;
 708
 709         num = LLVMBuildLShr(builder, num, pre_shift, "");
 710         num = LLVMBuildNUWAdd(builder, num, increment, "");
 711         num = LLVMBuildMul(builder,
 712                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 713                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 714         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 715         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 716         return LLVMBuildLShr(builder, num, post_shift, "");
 717 }
 718
 719 /* See fast_idiv_by_const.h. */
 720 /* Both operands must fit in 31 bits and the divisor must not be 1. */
 721 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
 722                                               LLVMValueRef num,
 723                                               LLVMValueRef multiplier,
 724                                               LLVMValueRef post_shift)
 725 {
 726         LLVMBuilderRef builder = ctx->builder;
 727
 728         num = LLVMBuildMul(builder,
 729                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 730                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 731         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 732         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 733         return LLVMBuildLShr(builder, num, post_shift, "");
 734 }
 735
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
        /* stc[0] is sc and stc[1] is tc (see build_cube_intrinsic). */
        LLVMValueRef stc[2];
        /* Major axis value, already multiplied by two. */
        LLVMValueRef ma;
        /* Cube face number. */
        LLVMValueRef id;
};
 745
 746 static void
 747 build_cube_intrinsic(struct ac_llvm_context *ctx,
 748                      LLVMValueRef in[3],
 749                      struct cube_selection_coords *out)
 750 {
 751         LLVMTypeRef f32 = ctx->f32;
 752
 753         out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
 754                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 755         out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
 756                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 757         out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
 758                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 759         out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
 760                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 761 }
 762
 763 /**
 764  * Build a manual selection sequence for cube face sc/tc coordinates and
 765  * major axis vector (multiplied by 2 for consistency) for the given
 766  * vec3 \p coords, for the face implied by \p selcoords.
 767  *
 768  * For the major axis, we always adjust the sign to be in the direction of
 769  * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 770  * the selcoords major axis.
 771  */
 772 static void build_cube_select(struct ac_llvm_context *ctx,
 773                               const struct cube_selection_coords *selcoords,
 774                               const LLVMValueRef *coords,
 775                               LLVMValueRef *out_st,
 776                               LLVMValueRef *out_ma)
 777 {
 778         LLVMBuilderRef builder = ctx->builder;
 779         LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
 780         LLVMValueRef is_ma_positive;
 781         LLVMValueRef sgn_ma;
 782         LLVMValueRef is_ma_z, is_not_ma_z;
 783         LLVMValueRef is_ma_y;
 784         LLVMValueRef is_ma_x;
 785         LLVMValueRef sgn;
 786         LLVMValueRef tmp;
 787
 788         is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
 789                 selcoords->ma, LLVMConstReal(f32, 0.0), "");
 790         sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
 791                 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
 792
 793         is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
 794         is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
 795         is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
 796                 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
 797         is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
 798
 799         /* Select sc */
 800         tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
 801         sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
 802                 LLVMBuildSelect(builder, is_ma_z, sgn_ma,
 803                         LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
 804         out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
 805
 806         /* Select tc */
 807         tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
 808         sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
 809                 LLVMConstReal(f32, -1.0), "");
 810         out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
 811
 812         /* Select ma */
 813         tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
 814                 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
 815         tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 816                                  ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
 817         *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
 818 }
 819
 820 void
 821 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
 822                        bool is_deriv, bool is_array, bool is_lod,
 823                        LLVMValueRef *coords_arg,
 824                        LLVMValueRef *derivs_arg)
 825 {
 826
 827         LLVMBuilderRef builder = ctx->builder;
 828         struct cube_selection_coords selcoords;
 829         LLVMValueRef coords[3];
 830         LLVMValueRef invma;
 831
 832         if (is_array && !is_lod) {
 833                 LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
 834
 835                 /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
 836                  *
 837                  *    "For Array forms, the array layer used will be
 838                  *
 839                  *       max(0, min(d−1, floor(layer+0.5)))
 840                  *
 841                  *     where d is the depth of the texture array and layer
 842                  *     comes from the component indicated in the tables below.
 843                  *     Workaroudn for an issue where the layer is taken from a
 844                  *     helper invocation which happens to fall on a different
 845                  *     layer due to extrapolation."
 846                  *
 847                  * GFX8 and earlier attempt to implement this in hardware by
 848                  * clamping the value of coords[2] = (8 * layer) + face.
 849                  * Unfortunately, this means that the we end up with the wrong
 850                  * face when clamping occurs.
 851                  *
 852                  * Clamp the layer earlier to work around the issue.
 853                  */
 854                 if (ctx->chip_class <= GFX8) {
 855                         LLVMValueRef ge0;
 856                         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
 857                         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
 858                 }
 859
 860                 coords_arg[3] = tmp;
 861         }
 862
 863         build_cube_intrinsic(ctx, coords_arg, &selcoords);
 864
 865         invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 866                         ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
 867         invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
 868
 869         for (int i = 0; i < 2; ++i)
 870                 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
 871
 872         coords[2] = selcoords.id;
 873
 874         if (is_deriv && derivs_arg) {
 875                 LLVMValueRef derivs[4];
 876                 int axis;
 877
 878                 /* Convert cube derivatives to 2D derivatives. */
 879                 for (axis = 0; axis < 2; axis++) {
 880                         LLVMValueRef deriv_st[2];
 881                         LLVMValueRef deriv_ma;
 882
 883                         /* Transform the derivative alongside the texture
 884                          * coordinate. Mathematically, the correct formula is
 885                          * as follows. Assume we're projecting onto the +Z face
 886                          * and denote by dx/dh the derivative of the (original)
 887                          * X texture coordinate with respect to horizontal
 888                          * window coordinates. The projection onto the +Z face
 889                          * plane is:
 890                          *
 891                          *   f(x,z) = x/z
 892                          *
 893                          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
 894                          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
 895                          *
 896                          * This motivatives the implementation below.
 897                          *
 898                          * Whether this actually gives the expected results for
 899                          * apps that might feed in derivatives obtained via
 900                          * finite differences is anyone's guess. The OpenGL spec
 901                          * seems awfully quiet about how textureGrad for cube
 902                          * maps should be handled.
 903                          */
 904                         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
 905                                           deriv_st, &deriv_ma);
 906
 907                         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
 908
 909                         for (int i = 0; i < 2; ++i)
 910                                 derivs[axis * 2 + i] =
 911                                         LLVMBuildFSub(builder,
 912                                                 LLVMBuildFMul(builder, deriv_st[i], invma, ""),
 913                                                 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
 914                 }
 915
 916                 memcpy(derivs_arg, derivs, sizeof(derivs));
 917         }
 918
 919         /* Shift the texture coordinate. This must be applied after the
 920          * derivative calculation.
 921          */
 922         for (int i = 0; i < 2; ++i)
 923                 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
 924
 925         if (is_array) {
 926                 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
 927                 /* coords_arg.w component - array_index for cube arrays */
 928                 coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
 929         }
 930
 931         memcpy(coords_arg, coords, sizeof(coords));
 932 }
 933
 934
 935 LLVMValueRef
 936 ac_build_fs_interp(struct ac_llvm_context *ctx,
 937                    LLVMValueRef llvm_chan,
 938                    LLVMValueRef attr_number,
 939                    LLVMValueRef params,
 940                    LLVMValueRef i,
 941                    LLVMValueRef j)
 942 {
 943         LLVMValueRef args[5];
 944         LLVMValueRef p1;
 945
 946         args[0] = i;
 947         args[1] = llvm_chan;
 948         args[2] = attr_number;
 949         args[3] = params;
 950
 951         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
 952                                 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 953
 954         args[0] = p1;
 955         args[1] = j;
 956         args[2] = llvm_chan;
 957         args[3] = attr_number;
 958         args[4] = params;
 959
 960         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
 961                                   ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 962 }
 963
 964 LLVMValueRef
 965 ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
 966                        LLVMValueRef llvm_chan,
 967                        LLVMValueRef attr_number,
 968                        LLVMValueRef params,
 969                        LLVMValueRef i,
 970                        LLVMValueRef j)
 971 {
 972         LLVMValueRef args[6];
 973         LLVMValueRef p1;
 974
 975         args[0] = i;
 976         args[1] = llvm_chan;
 977         args[2] = attr_number;
 978         args[3] = ctx->i1false;
 979         args[4] = params;
 980
 981         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
 982                                 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 983
 984         args[0] = p1;
 985         args[1] = j;
 986         args[2] = llvm_chan;
 987         args[3] = attr_number;
 988         args[4] = ctx->i1false;
 989         args[5] = params;
 990
 991         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
 992                                   ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
 993 }
 994
 995 LLVMValueRef
 996 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
 997                        LLVMValueRef parameter,
 998                        LLVMValueRef llvm_chan,
 999                        LLVMValueRef attr_number,
1000                        LLVMValueRef params)
1001 {
1002         LLVMValueRef args[4];
1003
1004         args[0] = parameter;
1005         args[1] = llvm_chan;
1006         args[2] = attr_number;
1007         args[3] = params;
1008
1009         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
1010                                   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
1011 }
1012
1013 LLVMValueRef
1014 ac_build_gep_ptr(struct ac_llvm_context *ctx,
1015                  LLVMValueRef base_ptr,
1016                  LLVMValueRef index)
1017 {
1018         return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1019 }
1020
1021 LLVMValueRef
1022 ac_build_gep0(struct ac_llvm_context *ctx,
1023               LLVMValueRef base_ptr,
1024               LLVMValueRef index)
1025 {
1026         LLVMValueRef indices[2] = {
1027                 ctx->i32_0,
1028                 index,
1029         };
1030         return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
1031 }
1032
1033 LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
1034                                   LLVMValueRef index)
1035 {
1036         return LLVMBuildPointerCast(ctx->builder,
1037                                     LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
1038                                     LLVMTypeOf(ptr), "");
1039 }
1040
1041 void
1042 ac_build_indexed_store(struct ac_llvm_context *ctx,
1043                        LLVMValueRef base_ptr, LLVMValueRef index,
1044                        LLVMValueRef value)
1045 {
1046         LLVMBuildStore(ctx->builder, value,
1047                        ac_build_gep0(ctx, base_ptr, index));
1048 }
1049
1050 /**
1051  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1052  * It's equivalent to doing a load from &base_ptr[index].
1053  *
1054  * \param base_ptr  Where the array starts.
1055  * \param index     The element index into the array.
1056  * \param uniform   Whether the base_ptr and index can be assumed to be
1057  *                  dynamically uniform (i.e. load to an SGPR)
1058  * \param invariant Whether the load is invariant (no other opcodes affect it)
1059  * \param no_unsigned_wraparound
1060  *    For all possible re-associations and re-distributions of an expression
1061  *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1062  *    without inbounds in base_ptr), this parameter is true if "addr + offset"
1063  *    does not result in an unsigned integer wraparound. This is used for
1064  *    optimal code generation of 32-bit pointer arithmetic.
1065  *
1066  *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
1067  *    integer wraparound can't be an imm offset in s_load_dword, because
1068  *    the instruction performs "addr + offset" in 64 bits.
1069  *
1070  *    Expected usage for bindless textures by chaining GEPs:
1071  *      // possible unsigned wraparound, don't use InBounds:
1072  *      ptr1 = LLVMBuildGEP(base_ptr, index);
1073  *      image = load(ptr1); // becomes "s_load ptr1, 0"
1074  *
1075  *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1076  *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1077  */
1078 static LLVMValueRef
1079 ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1080                      LLVMValueRef index, bool uniform, bool invariant,
1081                      bool no_unsigned_wraparound)
1082 {
1083         LLVMValueRef pointer, result;
1084
1085         if (no_unsigned_wraparound &&
1086             LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1087                 pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
1088         else
1089                 pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1090
1091         if (uniform)
1092                 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1093         result = LLVMBuildLoad(ctx->builder, pointer, "");
1094         if (invariant)
1095                 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1096         return result;
1097 }
1098
1099 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1100                            LLVMValueRef index)
1101 {
1102         return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
1103 }
1104
1105 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
1106                                      LLVMValueRef base_ptr, LLVMValueRef index)
1107 {
1108         return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
1109 }
1110
1111 /* This assumes that there is no unsigned integer wraparound during the address
1112  * computation, excluding all GEPs within base_ptr. */
1113 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
1114                                    LLVMValueRef base_ptr, LLVMValueRef index)
1115 {
1116         return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1117 }
1118
1119 /* See ac_build_load_custom() documentation. */
1120 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1121                                    LLVMValueRef base_ptr, LLVMValueRef index)
1122 {
1123         return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1124 }
1125
1126 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx,
1127                                       unsigned cache_policy)
1128 {
1129         return cache_policy |
1130                (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
1131 }
1132
1133 static void
1134 ac_build_llvm7_buffer_store_common(struct ac_llvm_context *ctx,
1135                                    LLVMValueRef rsrc,
1136                                    LLVMValueRef data,
1137                                    LLVMValueRef vindex,
1138                                    LLVMValueRef voffset,
1139                                    unsigned num_channels,
1140                                    unsigned cache_policy,
1141                                    bool use_format)
1142 {
1143         LLVMValueRef args[] = {
1144                 data,
1145                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1146                 vindex ? vindex : ctx->i32_0,
1147                 voffset,
1148                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
1149                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0)
1150         };
1151         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1152
1153         const char *type_names[] = {"f32", "v2f32", "v4f32"};
1154         char name[256];
1155
1156         if (use_format) {
1157                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s",
1158                          type_names[func]);
1159         } else {
1160                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
1161                          type_names[func]);
1162         }
1163
1164         ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
1165                            AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1166 }
1167
1168 static void
1169 ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
1170                                    LLVMValueRef rsrc,
1171                                    LLVMValueRef data,
1172                                    LLVMValueRef vindex,
1173                                    LLVMValueRef voffset,
1174                                    LLVMValueRef soffset,
1175                                    unsigned num_channels,
1176                                    LLVMTypeRef return_channel_type,
1177                                    unsigned cache_policy,
1178                                    bool use_format,
1179                                    bool structurized)
1180 {
1181         LLVMValueRef args[6];
1182         int idx = 0;
1183         args[idx++] = data;
1184         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1185         if (structurized)
1186                 args[idx++] = vindex ? vindex : ctx->i32_0;
1187         args[idx++] = voffset ? voffset : ctx->i32_0;
1188         args[idx++] = soffset ? soffset : ctx->i32_0;
1189         args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
1190         unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1191         const char *indexing_kind = structurized ? "struct" : "raw";
1192         char name[256], type_name[8];
1193
1194         LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type;
1195         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1196
1197         if (use_format) {
1198                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
1199                          indexing_kind, type_name);
1200         } else {
1201                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s",
1202                          indexing_kind, type_name);
1203         }
1204
1205         ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
1206                            AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1207 }
1208
1209 void
1210 ac_build_buffer_store_format(struct ac_llvm_context *ctx,
1211                              LLVMValueRef rsrc,
1212                              LLVMValueRef data,
1213                              LLVMValueRef vindex,
1214                              LLVMValueRef voffset,
1215                              unsigned num_channels,
1216                              unsigned cache_policy)
1217 {
1218         if (HAVE_LLVM >= 0x800) {
1219                 ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
1220                                                    voffset, NULL, num_channels,
1221                                                    ctx->f32, cache_policy,
1222                                                    true, true);
1223         } else {
1224                 ac_build_llvm7_buffer_store_common(ctx, rsrc, data, vindex, voffset,
1225                                                    num_channels, cache_policy,
1226                                                    true);
1227         }
1228 }
1229
1230 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1231  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1232  * or v4i32 (num_channels=3,4).
1233  */
1234 void
1235 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
1236                             LLVMValueRef rsrc,
1237                             LLVMValueRef vdata,
1238                             unsigned num_channels,
1239                             LLVMValueRef voffset,
1240                             LLVMValueRef soffset,
1241                             unsigned inst_offset,
1242                             unsigned cache_policy,
1243                             bool swizzle_enable_hint)
1244 {
1245         /* Split 3 channel stores, because only LLVM 9+ support 3-channel
1246          * intrinsics. */
1247         if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
1248                 LLVMValueRef v[3], v01;
1249
1250                 for (int i = 0; i < 3; i++) {
1251                         v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
1252                                         LLVMConstInt(ctx->i32, i, 0), "");
1253                 }
1254                 v01 = ac_build_gather_values(ctx, v, 2);
1255
1256                 ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
1257                                             soffset, inst_offset, cache_policy,
1258                                             swizzle_enable_hint);
1259                 ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
1260                                             soffset, inst_offset + 8,
1261                                             cache_policy,
1262                                             swizzle_enable_hint);
1263                 return;
1264         }
1265
1266         /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
1267          * (voffset is swizzled, but soffset isn't swizzled).
1268          * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
1269          */
1270         if (!swizzle_enable_hint) {
1271                 LLVMValueRef offset = soffset;
1272
1273                 if (inst_offset)
1274                         offset = LLVMBuildAdd(ctx->builder, offset,
1275                                               LLVMConstInt(ctx->i32, inst_offset, 0), "");
1276
1277                 if (HAVE_LLVM >= 0x800) {
1278                         ac_build_llvm8_buffer_store_common(ctx, rsrc,
1279                                                            ac_to_float(ctx, vdata),
1280                                                            ctx->i32_0,
1281                                                            voffset, offset,
1282                                                            num_channels,
1283                                                            ctx->f32,
1284                                                            cache_policy,
1285                                                            false, false);
1286                 } else {
1287                         if (voffset)
1288                                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1289
1290                         ac_build_llvm7_buffer_store_common(ctx, rsrc,
1291                                                            ac_to_float(ctx, vdata),
1292                                                            ctx->i32_0, offset,
1293                                                            num_channels, cache_policy,
1294                                                            false);
1295                 }
1296                 return;
1297         }
1298
1299         static const unsigned dfmts[] = {
1300                 V_008F0C_BUF_DATA_FORMAT_32,
1301                 V_008F0C_BUF_DATA_FORMAT_32_32,
1302                 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1303                 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1304         };
1305         unsigned dfmt = dfmts[num_channels - 1];
1306         unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1307         LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
1308
1309         ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1310                                    immoffset, num_channels, dfmt, nfmt, cache_policy);
1311 }
1312
1313 static LLVMValueRef
1314 ac_build_llvm7_buffer_load_common(struct ac_llvm_context *ctx,
1315                                   LLVMValueRef rsrc,
1316                                   LLVMValueRef vindex,
1317                                   LLVMValueRef voffset,
1318                                   unsigned num_channels,
1319                                   unsigned cache_policy,
1320                                   bool can_speculate,
1321                                   bool use_format)
1322 {
1323         LLVMValueRef args[] = {
1324                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1325                 vindex ? vindex : ctx->i32_0,
1326                 voffset,
1327                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
1328                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0)
1329         };
1330         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1331
1332         LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
1333         const char *type_names[] = {"f32", "v2f32", "v4f32"};
1334         char name[256];
1335
1336         if (use_format) {
1337                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s",
1338                          type_names[func]);
1339         } else {
1340                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
1341                          type_names[func]);
1342         }
1343
1344         return ac_build_intrinsic(ctx, name, types[func], args,
1345                                   ARRAY_SIZE(args),
1346                                   ac_get_load_intr_attribs(can_speculate));
1347 }
1348
1349 static LLVMValueRef
1350 ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
1351                                   LLVMValueRef rsrc,
1352                                   LLVMValueRef vindex,
1353                                   LLVMValueRef voffset,
1354                                   LLVMValueRef soffset,
1355                                   unsigned num_channels,
1356                                   LLVMTypeRef channel_type,
1357                                   unsigned cache_policy,
1358                                   bool can_speculate,
1359                                   bool use_format,
1360                                   bool structurized)
1361 {
1362         LLVMValueRef args[5];
1363         int idx = 0;
1364         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1365         if (structurized)
1366                 args[idx++] = vindex ? vindex : ctx->i32_0;
1367         args[idx++] = voffset ? voffset : ctx->i32_0;
1368         args[idx++] = soffset ? soffset : ctx->i32_0;
1369         args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1370         unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1371         const char *indexing_kind = structurized ? "struct" : "raw";
1372         char name[256], type_name[8];
1373
1374         LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
1375         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1376
1377         if (use_format) {
1378                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
1379                          indexing_kind, type_name);
1380         } else {
1381                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
1382                          indexing_kind, type_name);
1383         }
1384
1385         return ac_build_intrinsic(ctx, name, type, args, idx,
1386                                   ac_get_load_intr_attribs(can_speculate));
1387 }
1388
1389 LLVMValueRef
1390 ac_build_buffer_load(struct ac_llvm_context *ctx,
1391                      LLVMValueRef rsrc,
1392                      int num_channels,
1393                      LLVMValueRef vindex,
1394                      LLVMValueRef voffset,
1395                      LLVMValueRef soffset,
1396                      unsigned inst_offset,
1397                      unsigned cache_policy,
1398                      bool can_speculate,
1399                      bool allow_smem)
1400 {
1401         LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
1402         if (voffset)
1403                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1404         if (soffset)
1405                 offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
1406
1407         if (allow_smem && !(cache_policy & ac_slc) &&
1408             (!(cache_policy & ac_glc) || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= GFX8))) {
1409                 assert(vindex == NULL);
1410
1411                 LLVMValueRef result[8];
1412
1413                 for (int i = 0; i < num_channels; i++) {
1414                         if (i) {
1415                                 offset = LLVMBuildAdd(ctx->builder, offset,
1416                                                       LLVMConstInt(ctx->i32, 4, 0), "");
1417                         }
1418                         const char *intrname =
1419                                 HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
1420                                                     : "llvm.SI.load.const.v4i32";
1421                         unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
1422                         LLVMValueRef args[3] = {
1423                                 rsrc,
1424                                 offset,
1425                                 LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
1426                         };
1427                         result[i] = ac_build_intrinsic(ctx, intrname,
1428                                                        ctx->f32, args, num_args,
1429                                                        AC_FUNC_ATTR_READNONE |
1430                                                        (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0));
1431                 }
1432                 if (num_channels == 1)
1433                         return result[0];
1434
1435                 if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
1436                         result[num_channels++] = LLVMGetUndef(ctx->f32);
1437                 return ac_build_gather_values(ctx, result, num_channels);
1438         }
1439
1440         if (HAVE_LLVM >= 0x0800) {
1441                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex,
1442                                                          offset, ctx->i32_0,
1443                                                          num_channels, ctx->f32,
1444                                                          cache_policy,
1445                                                          can_speculate, false,
1446                                                          false);
1447         }
1448
1449         return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, offset,
1450                                                  num_channels, cache_policy,
1451                                                  can_speculate, false);
1452 }
1453
1454 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
1455                                          LLVMValueRef rsrc,
1456                                          LLVMValueRef vindex,
1457                                          LLVMValueRef voffset,
1458                                          unsigned num_channels,
1459                                          unsigned cache_policy,
1460                                          bool can_speculate)
1461 {
1462         if (HAVE_LLVM >= 0x800) {
1463                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1464                                                          num_channels, ctx->f32,
1465                                                          cache_policy, can_speculate, true, true);
1466         }
1467         return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, voffset,
1468                                                  num_channels, cache_policy,
1469                                                  can_speculate, true);
1470 }
1471
1472 LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
1473                                                   LLVMValueRef rsrc,
1474                                                   LLVMValueRef vindex,
1475                                                   LLVMValueRef voffset,
1476                                                   unsigned num_channels,
1477                                                   unsigned cache_policy,
1478                                                   bool can_speculate)
1479 {
1480         if (HAVE_LLVM >= 0x800) {
1481                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1482                                                          num_channels, ctx->f32,
1483                                                          cache_policy, can_speculate, true, true);
1484         }
1485
1486         LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
1487         LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
1488         stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
1489
1490         LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
1491                                                       LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
1492                                                       elem_count, stride, "");
1493
1494         LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
1495                                                        LLVMConstInt(ctx->i32, 2, 0), "");
1496
1497         return ac_build_llvm7_buffer_load_common(ctx, new_rsrc, vindex, voffset,
1498                                                  num_channels, cache_policy,
1499                                                  can_speculate, true);
1500 }
1501
1502 /// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
1503 /// value for LLVM8+ tbuffer intrinsics.
1504 static unsigned
1505 ac_get_tbuffer_format(struct ac_llvm_context *ctx,
1506                       unsigned dfmt, unsigned nfmt)
1507 {
1508         if (ctx->chip_class >= GFX10) {
1509                 unsigned format;
1510                 switch (dfmt) {
1511                 default: unreachable("bad dfmt");
1512                 case V_008F0C_BUF_DATA_FORMAT_INVALID: format = V_008F0C_IMG_FORMAT_INVALID; break;
1513                 case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break;
1514                 case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break;
1515                 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break;
1516                 case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break;
1517                 case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break;
1518                 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break;
1519                 case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break;
1520                 case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break;
1521                 case V_008F0C_BUF_DATA_FORMAT_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_UINT; break;
1522                 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break;
1523                 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; break;
1524                 }
1525
1526                 // Use the regularity properties of the combined format enum.
1527                 //
1528                 // Note: float is incompatible with 8-bit data formats,
1529                 //       [us]{norm,scaled} are incomparible with 32-bit data formats.
1530                 //       [us]scaled are not writable.
1531                 switch (nfmt) {
1532                 case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break;
1533                 case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break;
1534                 case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break;
1535                 case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break;
1536                 default: unreachable("bad nfmt");
1537                 case V_008F0C_BUF_NUM_FORMAT_UINT: break;
1538                 case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break;
1539                 case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break;
1540                 }
1541
1542                 return format;
1543         } else {
1544                 return dfmt | (nfmt << 4);
1545         }
1546 }
1547
1548 static LLVMValueRef
1549 ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
1550                             LLVMValueRef rsrc,
1551                             LLVMValueRef vindex,
1552                             LLVMValueRef voffset,
1553                             LLVMValueRef soffset,
1554                             unsigned num_channels,
1555                             unsigned dfmt,
1556                             unsigned nfmt,
1557                             unsigned cache_policy,
1558                             bool can_speculate,
1559                             bool structurized)
1560 {
1561         LLVMValueRef args[6];
1562         int idx = 0;
1563         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1564         if (structurized)
1565                 args[idx++] = vindex ? vindex : ctx->i32_0;
1566         args[idx++] = voffset ? voffset : ctx->i32_0;
1567         args[idx++] = soffset ? soffset : ctx->i32_0;
1568         args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
1569         args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1570         unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1571         const char *indexing_kind = structurized ? "struct" : "raw";
1572         char name[256], type_name[8];
1573
1574         LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1575         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1576
1577         snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
1578                  indexing_kind, type_name);
1579
1580         return ac_build_intrinsic(ctx, name, type, args, idx,
1581                                   ac_get_load_intr_attribs(can_speculate));
1582 }
1583
1584 static LLVMValueRef
1585 ac_build_tbuffer_load(struct ac_llvm_context *ctx,
1586                             LLVMValueRef rsrc,
1587                             LLVMValueRef vindex,
1588                             LLVMValueRef voffset,
1589                             LLVMValueRef soffset,
1590                             LLVMValueRef immoffset,
1591                             unsigned num_channels,
1592                             unsigned dfmt,
1593                             unsigned nfmt,
1594                             unsigned cache_policy,
1595                             bool can_speculate,
1596                             bool structurized) /* only matters for LLVM 8+ */
1597 {
1598         if (HAVE_LLVM >= 0x800) {
1599                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1600
1601                 return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset,
1602                                                    soffset, num_channels,
1603                                                    dfmt, nfmt, cache_policy,
1604                                                    can_speculate, structurized);
1605         }
1606
1607         LLVMValueRef args[] = {
1608                 rsrc,
1609                 vindex ? vindex : ctx->i32_0,
1610                 voffset,
1611                 soffset,
1612                 immoffset,
1613                 LLVMConstInt(ctx->i32, dfmt, false),
1614                 LLVMConstInt(ctx->i32, nfmt, false),
1615                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false),
1616                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false),
1617         };
1618         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1619         LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
1620         const char *type_names[] = {"i32", "v2i32", "v4i32"};
1621         char name[256];
1622
1623         snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s",
1624                  type_names[func]);
1625
1626         return ac_build_intrinsic(ctx, name, types[func], args, 9,
1627                                   ac_get_load_intr_attribs(can_speculate));
1628 }
1629
1630 LLVMValueRef
1631 ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx,
1632                              LLVMValueRef rsrc,
1633                              LLVMValueRef vindex,
1634                              LLVMValueRef voffset,
1635                              LLVMValueRef soffset,
1636                              LLVMValueRef immoffset,
1637                              unsigned num_channels,
1638                              unsigned dfmt,
1639                              unsigned nfmt,
1640                              unsigned cache_policy,
1641                              bool can_speculate)
1642 {
1643         return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset,
1644                                      immoffset, num_channels, dfmt, nfmt,
1645                                      cache_policy, can_speculate, true);
1646 }
1647
1648 LLVMValueRef
1649 ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx,
1650                           LLVMValueRef rsrc,
1651                           LLVMValueRef voffset,
1652                           LLVMValueRef soffset,
1653                           LLVMValueRef immoffset,
1654                           unsigned num_channels,
1655                           unsigned dfmt,
1656                           unsigned nfmt,
1657                           unsigned cache_policy,
1658                           bool can_speculate)
1659 {
1660         return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset,
1661                                      immoffset, num_channels, dfmt, nfmt,
1662                                      cache_policy, can_speculate, false);
1663 }
1664
1665 LLVMValueRef
1666 ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
1667                             LLVMValueRef rsrc,
1668                             LLVMValueRef voffset,
1669                             LLVMValueRef soffset,
1670                             LLVMValueRef immoffset,
1671                             unsigned cache_policy)
1672 {
1673         LLVMValueRef res;
1674
1675         if (HAVE_LLVM >= 0x900) {
1676                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1677
1678                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1679                 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1680                                                         voffset, soffset,
1681                                                         1, ctx->i16, cache_policy,
1682                                                         false, false, false);
1683         } else {
1684                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1685                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1686
1687                 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1688                                                 immoffset, 1, dfmt, nfmt, cache_policy,
1689                                                 false);
1690
1691                 res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
1692         }
1693
1694         return res;
1695 }
1696
1697 LLVMValueRef
1698 ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx,
1699                            LLVMValueRef rsrc,
1700                            LLVMValueRef voffset,
1701                            LLVMValueRef soffset,
1702                            LLVMValueRef immoffset,
1703                            unsigned cache_policy)
1704 {
1705         LLVMValueRef res;
1706
1707         if (HAVE_LLVM >= 0x900) {
1708                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1709
1710                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1711                 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1712                                                         voffset, soffset,
1713                                                         1, ctx->i8, cache_policy,
1714                                                         false, false, false);
1715         } else {
1716                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1717                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1718
1719                 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1720                                                 immoffset, 1, dfmt, nfmt, cache_policy,
1721                                                 false);
1722
1723                 res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
1724         }
1725
1726         return res;
1727 }
1728
1729 /**
1730  * Convert an 11- or 10-bit unsigned floating point number to an f32.
1731  *
1732  * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1733  * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1734  */
1735 static LLVMValueRef
1736 ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits, unsigned mant_bits)
1737 {
1738         assert(LLVMTypeOf(src) == ctx->i32);
1739
1740         LLVMValueRef tmp;
1741         LLVMValueRef mantissa;
1742         mantissa = LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1743
1744         /* Converting normal numbers is just a shift + correcting the exponent bias */
1745         unsigned normal_shift = 23 - mant_bits;
1746         unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1747         LLVMValueRef shifted, normal;
1748
1749         shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1750         normal = LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1751
1752         /* Converting nan/inf numbers is the same, but with a different exponent update */
1753         LLVMValueRef naninf;
1754         naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1755
1756         /* Converting denormals is the complex case: determine the leading zeros of the
1757          * mantissa to obtain the correct shift for the mantissa and exponent correction.
1758          */
1759         LLVMValueRef denormal;
1760         LLVMValueRef params[2] = {
1761                 mantissa,
1762                 ctx->i1true, /* result can be undef when arg is 0 */
1763         };
1764         LLVMValueRef ctlz = ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32,
1765                                               params, 2, AC_FUNC_ATTR_READNONE);
1766
1767         /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1768         tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1769         denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1770
1771         unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1772         tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1773         tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1774         denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1775
1776         /* Select the final result. */
1777         LLVMValueRef result;
1778
1779         tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1780                             LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
1781         result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1782
1783         tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1784                             LLVMConstInt(ctx->i32, 1 << mant_bits, false), "");
1785         result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1786
1787         tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1788         result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1789
1790         return ac_to_float(ctx, result);
1791 }
1792
1793 /**
1794  * Generate a fully general open coded buffer format fetch with all required
1795  * fixups suitable for vertex fetch, using non-format buffer loads.
1796  *
1797  * Some combinations of argument values have special interpretations:
1798  * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1799  * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1800  *
1801  * \param log_size log(size of channel in bytes)
1802  * \param num_channels number of channels (1 to 4)
1803  * \param format AC_FETCH_FORMAT_xxx value
1804  * \param reverse whether XYZ channels are reversed
1805  * \param known_aligned whether the source is known to be aligned to hardware's
1806  *                      effective element size for loading the given format
1807  *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1808  * \param rsrc buffer resource descriptor
1809  * \return the resulting vector of floats or integers bitcast to <4 x i32>
1810  */
1811 LLVMValueRef
1812 ac_build_opencoded_load_format(struct ac_llvm_context *ctx,
1813                                unsigned log_size,
1814                                unsigned num_channels,
1815                                unsigned format,
1816                                bool reverse,
1817                                bool known_aligned,
1818                                LLVMValueRef rsrc,
1819                                LLVMValueRef vindex,
1820                                LLVMValueRef voffset,
1821                                LLVMValueRef soffset,
1822                                unsigned cache_policy,
1823                                bool can_speculate)
1824 {
1825         LLVMValueRef tmp;
1826         unsigned load_log_size = log_size;
1827         unsigned load_num_channels = num_channels;
1828         if (log_size == 3) {
1829                 load_log_size = 2;
1830                 if (format == AC_FETCH_FORMAT_FLOAT) {
1831                         load_num_channels = 2 * num_channels;
1832                 } else {
1833                         load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1834                 }
1835         }
1836
1837         int log_recombine = 0;
1838         if (ctx->chip_class == GFX6 && !known_aligned) {
1839                 /* Avoid alignment restrictions by loading one byte at a time. */
1840                 load_num_channels <<= load_log_size;
1841                 log_recombine = load_log_size;
1842                 load_log_size = 0;
1843         } else if (load_num_channels == 2 || load_num_channels == 4) {
1844                 log_recombine = -util_logbase2(load_num_channels);
1845                 load_num_channels = 1;
1846                 load_log_size += -log_recombine;
1847         }
1848
1849         assert(load_log_size >= 2 || HAVE_LLVM >= 0x0900);
1850
1851         LLVMValueRef loads[32]; /* up to 32 bytes */
1852         for (unsigned i = 0; i < load_num_channels; ++i) {
1853                 tmp = LLVMBuildAdd(ctx->builder, soffset,
1854                                    LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1855                 if (HAVE_LLVM >= 0x0800) {
1856                         LLVMTypeRef channel_type = load_log_size == 0 ? ctx->i8 :
1857                                                    load_log_size == 1 ? ctx->i16 : ctx->i32;
1858                         unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1859                         loads[i] = ac_build_llvm8_buffer_load_common(
1860                                         ctx, rsrc, vindex, voffset, tmp,
1861                                         num_channels, channel_type, cache_policy,
1862                                         can_speculate, false, true);
1863                 } else {
1864                         tmp = LLVMBuildAdd(ctx->builder, voffset, tmp, "");
1865                         loads[i] = ac_build_llvm7_buffer_load_common(
1866                                         ctx, rsrc, vindex, tmp,
1867                                         1 << (load_log_size - 2), cache_policy, can_speculate, false);
1868                 }
1869                 if (load_log_size >= 2)
1870                         loads[i] = ac_to_integer(ctx, loads[i]);
1871         }
1872
1873         if (log_recombine > 0) {
1874                 /* Recombine bytes if necessary (GFX6 only) */
1875                 LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1876
1877                 for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1878                         LLVMValueRef accum = NULL;
1879                         for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1880                                 tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1881                                 if (i == 0) {
1882                                         accum = tmp;
1883                                 } else {
1884                                         tmp = LLVMBuildShl(ctx->builder, tmp,
1885                                                            LLVMConstInt(dst_type, 8 * i, false), "");
1886                                         accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1887                                 }
1888                         }
1889                         loads[dst] = accum;
1890                 }
1891         } else if (log_recombine < 0) {
1892                 /* Split vectors of dwords */
1893                 if (load_log_size > 2) {
1894                         assert(load_num_channels == 1);
1895                         LLVMValueRef loaded = loads[0];
1896                         unsigned log_split = load_log_size - 2;
1897                         log_recombine += log_split;
1898                         load_num_channels = 1 << log_split;
1899                         load_log_size = 2;
1900                         for (unsigned i = 0; i < load_num_channels; ++i) {
1901                                 tmp = LLVMConstInt(ctx->i32, i, false);
1902                                 loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1903                         }
1904                 }
1905
1906                 /* Further split dwords and shorts if required */
1907                 if (log_recombine < 0) {
1908                         for (unsigned src = load_num_channels,
1909                                       dst = load_num_channels << -log_recombine;
1910                              src > 0; --src) {
1911                                 unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1912                                 LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1913                                 LLVMValueRef loaded = loads[src - 1];
1914                                 LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1915                                 for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1916                                         tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1917                                         tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1918                                         loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1919                                 }
1920                         }
1921                 }
1922         }
1923
1924         if (log_size == 3) {
1925                 if (format == AC_FETCH_FORMAT_FLOAT) {
1926                         for (unsigned i = 0; i < num_channels; ++i) {
1927                                 tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1928                                 loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1929                         }
1930                 } else if (format == AC_FETCH_FORMAT_FIXED) {
1931                         /* 10_11_11_FLOAT */
1932                         LLVMValueRef data = loads[0];
1933                         LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1934                         LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1935                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1936                         LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1937                         LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1938
1939                         loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1940                         loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1941                         loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1942
1943                         num_channels = 3;
1944                         log_size = 2;
1945                         format = AC_FETCH_FORMAT_FLOAT;
1946                 } else {
1947                         /* 2_10_10_10 data formats */
1948                         LLVMValueRef data = loads[0];
1949                         LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1950                         LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1951                         loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1952                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1953                         loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1954                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1955                         loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1956                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1957                         loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1958
1959                         num_channels = 4;
1960                 }
1961         }
1962
1963         if (format == AC_FETCH_FORMAT_FLOAT) {
1964                 if (log_size != 2) {
1965                         for (unsigned chan = 0; chan < num_channels; ++chan) {
1966                                 tmp = ac_to_float(ctx, loads[chan]);
1967                                 if (log_size == 3)
1968                                         tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1969                                 else if (log_size == 1)
1970                                         tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1971                                 loads[chan] = ac_to_integer(ctx, tmp);
1972                         }
1973                 }
1974         } else if (format == AC_FETCH_FORMAT_UINT) {
1975                 if (log_size != 2) {
1976                         for (unsigned chan = 0; chan < num_channels; ++chan)
1977                                 loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1978                 }
1979         } else if (format == AC_FETCH_FORMAT_SINT) {
1980                 if (log_size != 2) {
1981                         for (unsigned chan = 0; chan < num_channels; ++chan)
1982                                 loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1983                 }
1984         } else {
1985                 bool unsign = format == AC_FETCH_FORMAT_UNORM ||
1986                               format == AC_FETCH_FORMAT_USCALED ||
1987                               format == AC_FETCH_FORMAT_UINT;
1988
1989                 for (unsigned chan = 0; chan < num_channels; ++chan) {
1990                         if (unsign) {
1991                                 tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1992                         } else {
1993                                 tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1994                         }
1995
1996                         LLVMValueRef scale = NULL;
1997                         if (format == AC_FETCH_FORMAT_FIXED) {
1998                                 assert(log_size == 2);
1999                                 scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
2000                         } else if (format == AC_FETCH_FORMAT_UNORM) {
2001                                 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
2002                                 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
2003                         } else if (format == AC_FETCH_FORMAT_SNORM) {
2004                                 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
2005                                 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
2006                         }
2007                         if (scale)
2008                                 tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
2009
2010                         if (format == AC_FETCH_FORMAT_SNORM) {
2011                                 /* Clamp to [-1, 1] */
2012                                 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
2013                                 LLVMValueRef clamp =
2014                                         LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
2015                                 tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
2016                         }
2017
2018                         loads[chan] = ac_to_integer(ctx, tmp);
2019                 }
2020         }
2021
2022         while (num_channels < 4) {
2023                 if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
2024                         loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
2025                 } else {
2026                         loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
2027                 }
2028                 num_channels++;
2029         }
2030
2031         if (reverse) {
2032                 tmp = loads[0];
2033                 loads[0] = loads[2];
2034                 loads[2] = tmp;
2035         }
2036
2037         return ac_build_gather_values(ctx, loads, 4);
2038 }
2039
2040 static void
2041 ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
2042                              LLVMValueRef rsrc,
2043                              LLVMValueRef vdata,
2044                              LLVMValueRef vindex,
2045                              LLVMValueRef voffset,
2046                              LLVMValueRef soffset,
2047                              unsigned num_channels,
2048                              unsigned dfmt,
2049                              unsigned nfmt,
2050                              unsigned cache_policy,
2051                              bool structurized)
2052 {
2053         LLVMValueRef args[7];
2054         int idx = 0;
2055         args[idx++] = vdata;
2056         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
2057         if (structurized)
2058                 args[idx++] = vindex ? vindex : ctx->i32_0;
2059         args[idx++] = voffset ? voffset : ctx->i32_0;
2060         args[idx++] = soffset ? soffset : ctx->i32_0;
2061         args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
2062         args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
2063         unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
2064         const char *indexing_kind = structurized ? "struct" : "raw";
2065         char name[256], type_name[8];
2066
2067         LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
2068         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
2069
2070         snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
2071                  indexing_kind, type_name);
2072
2073         ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
2074                            AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
2075 }
2076
2077 static void
2078 ac_build_tbuffer_store(struct ac_llvm_context *ctx,
2079                        LLVMValueRef rsrc,
2080                        LLVMValueRef vdata,
2081                        LLVMValueRef vindex,
2082                        LLVMValueRef voffset,
2083                        LLVMValueRef soffset,
2084                        LLVMValueRef immoffset,
2085                        unsigned num_channels,
2086                        unsigned dfmt,
2087                        unsigned nfmt,
2088                        unsigned cache_policy,
2089                        bool structurized) /* only matters for LLVM 8+ */
2090 {
2091         if (HAVE_LLVM >= 0x800) {
2092                 voffset = LLVMBuildAdd(ctx->builder,
2093                                        voffset ? voffset : ctx->i32_0,
2094                                        immoffset, "");
2095
2096                 ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
2097                                              soffset, num_channels, dfmt, nfmt,
2098                                              cache_policy, structurized);
2099         } else {
2100                 LLVMValueRef params[] = {
2101                         vdata,
2102                         rsrc,
2103                         vindex ? vindex : ctx->i32_0,
2104                         voffset ? voffset : ctx->i32_0,
2105                         soffset ? soffset : ctx->i32_0,
2106                         immoffset,
2107                         LLVMConstInt(ctx->i32, dfmt, false),
2108                         LLVMConstInt(ctx->i32, nfmt, false),
2109                         LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false),
2110                         LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false),
2111                 };
2112                 unsigned func = CLAMP(num_channels, 1, 3) - 1;
2113                 const char *type_names[] = {"i32", "v2i32", "v4i32"};
2114                 char name[256];
2115
2116                 snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
2117                          type_names[func]);
2118
2119                 ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
2120                                    AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
2121         }
2122 }
2123
2124 void
2125 ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
2126                               LLVMValueRef rsrc,
2127                               LLVMValueRef vdata,
2128                               LLVMValueRef vindex,
2129                               LLVMValueRef voffset,
2130                               LLVMValueRef soffset,
2131                               LLVMValueRef immoffset,
2132                               unsigned num_channels,
2133                               unsigned dfmt,
2134                               unsigned nfmt,
2135                               unsigned cache_policy)
2136 {
2137         ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
2138                                immoffset, num_channels, dfmt, nfmt, cache_policy,
2139                                true);
2140 }
2141
2142 void
2143 ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
2144                            LLVMValueRef rsrc,
2145                            LLVMValueRef vdata,
2146                            LLVMValueRef voffset,
2147                            LLVMValueRef soffset,
2148                            LLVMValueRef immoffset,
2149                            unsigned num_channels,
2150                            unsigned dfmt,
2151                            unsigned nfmt,
2152                            unsigned cache_policy)
2153 {
2154         ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
2155                                immoffset, num_channels, dfmt, nfmt, cache_policy,
2156                                false);
2157 }
2158
2159 void
2160 ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
2161                              LLVMValueRef rsrc,
2162                              LLVMValueRef vdata,
2163                              LLVMValueRef voffset,
2164                              LLVMValueRef soffset,
2165                              unsigned cache_policy)
2166 {
2167         vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
2168
2169         if (HAVE_LLVM >= 0x900) {
2170                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
2171                 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
2172                                                    voffset, soffset, 1,
2173                                                    ctx->i16, cache_policy,
2174                                                    false, false);
2175         } else {
2176                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
2177                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
2178
2179                 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
2180
2181                 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
2182                                            ctx->i32_0, 1, dfmt, nfmt, cache_policy);
2183         }
2184 }
2185
2186 void
2187 ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
2188                             LLVMValueRef rsrc,
2189                             LLVMValueRef vdata,
2190                             LLVMValueRef voffset,
2191                             LLVMValueRef soffset,
2192                             unsigned cache_policy)
2193 {
2194         vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
2195
2196         if (HAVE_LLVM >= 0x900) {
2197                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
2198                 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
2199                                                    voffset, soffset, 1,
2200                                                    ctx->i8, cache_policy,
2201                                                    false, false);
2202         } else {
2203                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
2204                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
2205
2206                 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
2207
2208                 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
2209                                            ctx->i32_0, 1, dfmt, nfmt, cache_policy);
2210         }
2211 }
2212 /**
2213  * Set range metadata on an instruction.  This can only be used on load and
2214  * call instructions.  If you know an instruction can only produce the values
2215  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
2216  * \p lo is the minimum value inclusive.
2217  * \p hi is the maximum value exclusive.
2218  */
2219 static void set_range_metadata(struct ac_llvm_context *ctx,
2220                                LLVMValueRef value, unsigned lo, unsigned hi)
2221 {
2222         LLVMValueRef range_md, md_args[2];
2223         LLVMTypeRef type = LLVMTypeOf(value);
2224         LLVMContextRef context = LLVMGetTypeContext(type);
2225
2226         md_args[0] = LLVMConstInt(type, lo, false);
2227         md_args[1] = LLVMConstInt(type, hi, false);
2228         range_md = LLVMMDNodeInContext(context, md_args, 2);
2229         LLVMSetMetadata(value, ctx->range_md_kind, range_md);
2230 }
2231
2232 LLVMValueRef
2233 ac_get_thread_id(struct ac_llvm_context *ctx)
2234 {
2235         LLVMValueRef tid;
2236
2237         LLVMValueRef tid_args[2];
2238         tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
2239         tid_args[1] = ctx->i32_0;
2240         tid_args[1] = ac_build_intrinsic(ctx,
2241                                          "llvm.amdgcn.mbcnt.lo", ctx->i32,
2242                                          tid_args, 2, AC_FUNC_ATTR_READNONE);
2243
2244         if (ctx->wave_size == 32) {
2245                 tid = tid_args[1];
2246         } else {
2247                 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
2248                                          ctx->i32, tid_args,
2249                                          2, AC_FUNC_ATTR_READNONE);
2250         }
2251         set_range_metadata(ctx, tid, 0, ctx->wave_size);
2252         return tid;
2253 }
2254
2255 /*
2256  * AMD GCN implements derivatives using the local data store (LDS)
2257  * All writes to the LDS happen in all executing threads at
2258  * the same time. TID is the Thread ID for the current
2259  * thread and is a value between 0 and 63, representing
2260  * the thread's position in the wavefront.
2261  *
2262  * For the pixel shader threads are grouped into quads of four pixels.
2263  * The TIDs of the pixels of a quad are:
2264  *
2265  *  +------+------+
2266  *  |4n + 0|4n + 1|
2267  *  +------+------+
2268  *  |4n + 2|4n + 3|
2269  *  +------+------+
2270  *
2271  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
2272  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
2273  * the current pixel's column, and masking with 0xfffffffe yields the TID
2274  * of the left pixel of the current pixel's row.
2275  *
2276  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
2277  * adding 2 yields the TID of the pixel below the top pixel.
2278  */
2279 LLVMValueRef
2280 ac_build_ddxy(struct ac_llvm_context *ctx,
2281               uint32_t mask,
2282               int idx,
2283               LLVMValueRef val)
2284 {
2285         unsigned tl_lanes[4], trbl_lanes[4];
2286         char name[32], type[8];
2287         LLVMValueRef tl, trbl;
2288         LLVMTypeRef result_type;
2289         LLVMValueRef result;
2290
2291         result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
2292
2293         if (result_type == ctx->f16)
2294                 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
2295
2296         for (unsigned i = 0; i < 4; ++i) {
2297                 tl_lanes[i] = i & mask;
2298                 trbl_lanes[i] = (i & mask) + idx;
2299         }
2300
2301         tl = ac_build_quad_swizzle(ctx, val,
2302                                    tl_lanes[0], tl_lanes[1],
2303                                    tl_lanes[2], tl_lanes[3]);
2304         trbl = ac_build_quad_swizzle(ctx, val,
2305                                      trbl_lanes[0], trbl_lanes[1],
2306                                      trbl_lanes[2], trbl_lanes[3]);
2307
2308         if (result_type == ctx->f16) {
2309                 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
2310                 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
2311         }
2312
2313         tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
2314         trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
2315         result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
2316
2317         ac_build_type_name_for_intr(result_type, type, sizeof(type));
2318         snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
2319
2320         return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
2321 }
2322
2323 void
2324 ac_build_sendmsg(struct ac_llvm_context *ctx,
2325                  uint32_t msg,
2326                  LLVMValueRef wave_id)
2327 {
2328         LLVMValueRef args[2];
2329         args[0] = LLVMConstInt(ctx->i32, msg, false);
2330         args[1] = wave_id;
2331         ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
2332 }
2333
2334 LLVMValueRef
2335 ac_build_imsb(struct ac_llvm_context *ctx,
2336               LLVMValueRef arg,
2337               LLVMTypeRef dst_type)
2338 {
2339         LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
2340                                               dst_type, &arg, 1,
2341                                               AC_FUNC_ATTR_READNONE);
2342
2343         /* The HW returns the last bit index from MSB, but NIR/TGSI wants
2344          * the index from LSB. Invert it by doing "31 - msb". */
2345         msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
2346                            msb, "");
2347
2348         LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
2349         LLVMValueRef cond = LLVMBuildOr(ctx->builder,
2350                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2351                                                       arg, ctx->i32_0, ""),
2352                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2353                                                       arg, all_ones, ""), "");
2354
2355         return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
2356 }
2357
2358 LLVMValueRef
2359 ac_build_umsb(struct ac_llvm_context *ctx,
2360               LLVMValueRef arg,
2361               LLVMTypeRef dst_type)
2362 {
2363         const char *intrin_name;
2364         LLVMTypeRef type;
2365         LLVMValueRef highest_bit;
2366         LLVMValueRef zero;
2367         unsigned bitsize;
2368
2369         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
2370         switch (bitsize) {
2371         case 64:
2372                 intrin_name = "llvm.ctlz.i64";
2373                 type = ctx->i64;
2374                 highest_bit = LLVMConstInt(ctx->i64, 63, false);
2375                 zero = ctx->i64_0;
2376                 break;
2377         case 32:
2378                 intrin_name = "llvm.ctlz.i32";
2379                 type = ctx->i32;
2380                 highest_bit = LLVMConstInt(ctx->i32, 31, false);
2381                 zero = ctx->i32_0;
2382                 break;
2383         case 16:
2384                 intrin_name = "llvm.ctlz.i16";
2385                 type = ctx->i16;
2386                 highest_bit = LLVMConstInt(ctx->i16, 15, false);
2387                 zero = ctx->i16_0;
2388                 break;
2389         case 8:
2390                 intrin_name = "llvm.ctlz.i8";
2391                 type = ctx->i8;
2392                 highest_bit = LLVMConstInt(ctx->i8, 7, false);
2393                 zero = ctx->i8_0;
2394                 break;
2395         default:
2396                 unreachable(!"invalid bitsize");
2397                 break;
2398         }
2399
2400         LLVMValueRef params[2] = {
2401                 arg,
2402                 ctx->i1true,
2403         };
2404
2405         LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
2406                                               params, 2,
2407                                               AC_FUNC_ATTR_READNONE);
2408
2409         /* The HW returns the last bit index from MSB, but TGSI/NIR wants
2410          * the index from LSB. Invert it by doing "31 - msb". */
2411         msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
2412
2413         if (bitsize == 64) {
2414                 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
2415         } else if (bitsize < 32) {
2416                 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
2417         }
2418
2419         /* check for zero */
2420         return LLVMBuildSelect(ctx->builder,
2421                                LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
2422                                LLVMConstInt(ctx->i32, -1, true), msb, "");
2423 }
2424
2425 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
2426                            LLVMValueRef b)
2427 {
2428         char name[64];
2429         snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2430         LLVMValueRef args[2] = {a, b};
2431         return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2432                                   AC_FUNC_ATTR_READNONE);
2433 }
2434
2435 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
2436                            LLVMValueRef b)
2437 {
2438         char name[64];
2439         snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2440         LLVMValueRef args[2] = {a, b};
2441         return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2442                                   AC_FUNC_ATTR_READNONE);
2443 }
2444
2445 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a,
2446                            LLVMValueRef b)
2447 {
2448         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
2449         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2450 }
2451
2452 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a,
2453                            LLVMValueRef b)
2454 {
2455         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
2456         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2457 }
2458
2459 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
2460                            LLVMValueRef b)
2461 {
2462         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
2463         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2464 }
2465
2466 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a,
2467                            LLVMValueRef b)
2468 {
2469         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
2470         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2471 }
2472
2473 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
2474 {
2475         LLVMTypeRef t = LLVMTypeOf(value);
2476         return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
2477                              LLVMConstReal(t, 1.0));
2478 }
2479
2480 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
2481 {
2482         LLVMValueRef args[9];
2483
2484         args[0] = LLVMConstInt(ctx->i32, a->target, 0);
2485         args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
2486
2487         if (a->compr) {
2488                 LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
2489                 LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
2490
2491                 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
2492                                 v2i16, "");
2493                 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
2494                                 v2i16, "");
2495                 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
2496                 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2497
2498                 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
2499                                    ctx->voidt, args, 6, 0);
2500         } else {
2501                 args[2] = a->out[0];
2502                 args[3] = a->out[1];
2503                 args[4] = a->out[2];
2504                 args[5] = a->out[3];
2505                 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
2506                 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2507
2508                 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
2509                                    ctx->voidt, args, 8, 0);
2510         }
2511 }
2512
2513 void ac_build_export_null(struct ac_llvm_context *ctx)
2514 {
2515         struct ac_export_args args;
2516
2517         args.enabled_channels = 0x0; /* enabled channels */
2518         args.valid_mask = 1; /* whether the EXEC mask is valid */
2519         args.done = 1; /* DONE bit */
2520         args.target = V_008DFC_SQ_EXP_NULL;
2521         args.compr = 0; /* COMPR flag (0 = 32-bit export) */
2522         args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2523         args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2524         args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2525         args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2526
2527         ac_build_export(ctx, &args);
2528 }
2529
2530 static unsigned ac_num_coords(enum ac_image_dim dim)
2531 {
2532         switch (dim) {
2533         case ac_image_1d:
2534                 return 1;
2535         case ac_image_2d:
2536         case ac_image_1darray:
2537                  return 2;
2538         case ac_image_3d:
2539         case ac_image_cube:
2540         case ac_image_2darray:
2541         case ac_image_2dmsaa:
2542                 return 3;
2543         case ac_image_2darraymsaa:
2544                 return 4;
2545         default:
2546                 unreachable("ac_num_coords: bad dim");
2547         }
2548 }
2549
2550 static unsigned ac_num_derivs(enum ac_image_dim dim)
2551 {
2552         switch (dim) {
2553         case ac_image_1d:
2554         case ac_image_1darray:
2555                 return 2;
2556         case ac_image_2d:
2557         case ac_image_2darray:
2558         case ac_image_cube:
2559                 return 4;
2560         case ac_image_3d:
2561                 return 6;
2562         case ac_image_2dmsaa:
2563         case ac_image_2darraymsaa:
2564         default:
2565                 unreachable("derivatives not supported");
2566         }
2567 }
2568
2569 static const char *get_atomic_name(enum ac_atomic_op op)
2570 {
2571         switch (op) {
2572         case ac_atomic_swap: return "swap";
2573         case ac_atomic_add: return "add";
2574         case ac_atomic_sub: return "sub";
2575         case ac_atomic_smin: return "smin";
2576         case ac_atomic_umin: return "umin";
2577         case ac_atomic_smax: return "smax";
2578         case ac_atomic_umax: return "umax";
2579         case ac_atomic_and: return "and";
2580         case ac_atomic_or: return "or";
2581         case ac_atomic_xor: return "xor";
2582         }
2583         unreachable("bad atomic op");
2584 }
2585
2586 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
2587                                    struct ac_image_args *a)
2588 {
2589         const char *overload[3] = { "", "", "" };
2590         unsigned num_overloads = 0;
2591         LLVMValueRef args[18];
2592         unsigned num_args = 0;
2593         enum ac_image_dim dim = a->dim;
2594
2595         assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
2596                !a->level_zero);
2597         assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
2598                 a->opcode != ac_image_store_mip) ||
2599                a->lod);
2600         assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2601                (!a->compare && !a->offset));
2602         assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2603                 a->opcode == ac_image_get_lod) ||
2604                !a->bias);
2605         assert((a->bias ? 1 : 0) +
2606                (a->lod ? 1 : 0) +
2607                (a->level_zero ? 1 : 0) +
2608                (a->derivs[0] ? 1 : 0) <= 1);
2609
2610         if (a->opcode == ac_image_get_lod) {
2611                 switch (dim) {
2612                 case ac_image_1darray:
2613                         dim = ac_image_1d;
2614                         break;
2615                 case ac_image_2darray:
2616                 case ac_image_cube:
2617                         dim = ac_image_2d;
2618                         break;
2619                 default:
2620                         break;
2621                 }
2622         }
2623
2624         bool sample = a->opcode == ac_image_sample ||
2625                       a->opcode == ac_image_gather4 ||
2626                       a->opcode == ac_image_get_lod;
2627         bool atomic = a->opcode == ac_image_atomic ||
2628                       a->opcode == ac_image_atomic_cmpswap;
2629         bool load = a->opcode == ac_image_sample ||
2630                     a->opcode == ac_image_gather4 ||
2631                     a->opcode == ac_image_load ||
2632                     a->opcode == ac_image_load_mip;
2633         LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
2634
2635         if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2636                 args[num_args++] = a->data[0];
2637                 if (a->opcode == ac_image_atomic_cmpswap)
2638                         args[num_args++] = a->data[1];
2639         }
2640
2641         if (!atomic)
2642                 args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
2643
2644         if (a->offset)
2645                 args[num_args++] = ac_to_integer(ctx, a->offset);
2646         if (a->bias) {
2647                 args[num_args++] = ac_to_float(ctx, a->bias);
2648                 overload[num_overloads++] = ".f32";
2649         }
2650         if (a->compare)
2651                 args[num_args++] = ac_to_float(ctx, a->compare);
2652         if (a->derivs[0]) {
2653                 unsigned count = ac_num_derivs(dim);
2654                 for (unsigned i = 0; i < count; ++i)
2655                         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
2656                 overload[num_overloads++] = ".f32";
2657         }
2658         unsigned num_coords =
2659                 a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
2660         for (unsigned i = 0; i < num_coords; ++i)
2661                 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
2662         if (a->lod)
2663                 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
2664         overload[num_overloads++] = sample ? ".f32" : ".i32";
2665
2666         args[num_args++] = a->resource;
2667         if (sample) {
2668                 args[num_args++] = a->sampler;
2669                 args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
2670         }
2671
2672         args[num_args++] = ctx->i32_0; /* texfailctrl */
2673         args[num_args++] = LLVMConstInt(ctx->i32,
2674                                         load ? get_load_cache_policy(ctx, a->cache_policy) :
2675                                                a->cache_policy, false);
2676
2677         const char *name;
2678         const char *atomic_subop = "";
2679         switch (a->opcode) {
2680         case ac_image_sample: name = "sample"; break;
2681         case ac_image_gather4: name = "gather4"; break;
2682         case ac_image_load: name = "load"; break;
2683         case ac_image_load_mip: name = "load.mip"; break;
2684         case ac_image_store: name = "store"; break;
2685         case ac_image_store_mip: name = "store.mip"; break;
2686         case ac_image_atomic:
2687                 name = "atomic.";
2688                 atomic_subop = get_atomic_name(a->atomic);
2689                 break;
2690         case ac_image_atomic_cmpswap:
2691                 name = "atomic.";
2692                 atomic_subop = "cmpswap";
2693                 break;
2694         case ac_image_get_lod: name = "getlod"; break;
2695         case ac_image_get_resinfo: name = "getresinfo"; break;
2696         default: unreachable("invalid image opcode");
2697         }
2698
2699         const char *dimname;
2700         switch (dim) {
2701         case ac_image_1d: dimname = "1d"; break;
2702         case ac_image_2d: dimname = "2d"; break;
2703         case ac_image_3d: dimname = "3d"; break;
2704         case ac_image_cube: dimname = "cube"; break;
2705         case ac_image_1darray: dimname = "1darray"; break;
2706         case ac_image_2darray: dimname = "2darray"; break;
2707         case ac_image_2dmsaa: dimname = "2dmsaa"; break;
2708         case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
2709         default: unreachable("invalid dim");
2710         }
2711
2712         bool lod_suffix =
2713                 a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
2714         char intr_name[96];
2715         snprintf(intr_name, sizeof(intr_name),
2716                  "llvm.amdgcn.image.%s%s" /* base name */
2717                  "%s%s%s" /* sample/gather modifiers */
2718                  ".%s.%s%s%s%s", /* dimension and type overloads */
2719                  name, atomic_subop,
2720                  a->compare ? ".c" : "",
2721                  a->bias ? ".b" :
2722                  lod_suffix ? ".l" :
2723                  a->derivs[0] ? ".d" :
2724                  a->level_zero ? ".lz" : "",
2725                  a->offset ? ".o" : "",
2726                  dimname,
2727                  atomic ? "i32" : "v4f32",
2728                  overload[0], overload[1], overload[2]);
2729
2730         LLVMTypeRef retty;
2731         if (atomic)
2732                 retty = ctx->i32;
2733         else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
2734                 retty = ctx->voidt;
2735         else
2736                 retty = ctx->v4f32;
2737
2738         LLVMValueRef result =
2739                 ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
2740                                    a->attributes);
2741         if (!sample && retty == ctx->v4f32) {
2742                 result = LLVMBuildBitCast(ctx->builder, result,
2743                                           ctx->v4i32, "");
2744         }
2745         return result;
2746 }
2747
2748 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
2749                                     LLVMValueRef args[2])
2750 {
2751         LLVMTypeRef v2f16 =
2752                 LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
2753
2754         return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
2755                                   args, 2, AC_FUNC_ATTR_READNONE);
2756 }
2757
2758 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
2759                                      LLVMValueRef args[2])
2760 {
2761         LLVMValueRef res =
2762                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
2763                                    ctx->v2i16, args, 2,
2764                                    AC_FUNC_ATTR_READNONE);
2765         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2766 }
2767
2768 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
2769                                      LLVMValueRef args[2])
2770 {
2771         LLVMValueRef res =
2772                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
2773                                    ctx->v2i16, args, 2,
2774                                    AC_FUNC_ATTR_READNONE);
2775         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2776 }
2777
2778 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2779 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx,
2780                                  LLVMValueRef args[2], unsigned bits, bool hi)
2781 {
2782         assert(bits == 8 || bits == 10 || bits == 16);
2783
2784         LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2785                 bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2786         LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2787                 bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2788         LLVMValueRef max_alpha =
2789                 bits != 10 ? max_rgb : ctx->i32_1;
2790         LLVMValueRef min_alpha =
2791                 bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2792
2793         /* Clamp. */
2794         if (bits != 16) {
2795                 for (int i = 0; i < 2; i++) {
2796                         bool alpha = hi && i == 1;
2797                         args[i] = ac_build_imin(ctx, args[i],
2798                                                 alpha ? max_alpha : max_rgb);
2799                         args[i] = ac_build_imax(ctx, args[i],
2800                                                 alpha ? min_alpha : min_rgb);
2801                 }
2802         }
2803
2804         LLVMValueRef res =
2805                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
2806                                    ctx->v2i16, args, 2,
2807                                    AC_FUNC_ATTR_READNONE);
2808         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2809 }
2810
2811 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2812 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx,
2813                                  LLVMValueRef args[2], unsigned bits, bool hi)
2814 {
2815         assert(bits == 8 || bits == 10 || bits == 16);
2816
2817         LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2818                 bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2819         LLVMValueRef max_alpha =
2820                 bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2821
2822         /* Clamp. */
2823         if (bits != 16) {
2824                 for (int i = 0; i < 2; i++) {
2825                         bool alpha = hi && i == 1;
2826                         args[i] = ac_build_umin(ctx, args[i],
2827                                                 alpha ? max_alpha : max_rgb);
2828                 }
2829         }
2830
2831         LLVMValueRef res =
2832                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
2833                                    ctx->v2i16, args, 2,
2834                                    AC_FUNC_ATTR_READNONE);
2835         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2836 }
2837
2838 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2839 {
2840         return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
2841                                   &i1, 1, AC_FUNC_ATTR_READNONE);
2842 }
2843
2844 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2845 {
2846         ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
2847                            &i1, 1, 0);
2848 }
2849
2850 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
2851                           LLVMValueRef offset, LLVMValueRef width,
2852                           bool is_signed)
2853 {
2854         LLVMValueRef args[] = {
2855                 input,
2856                 offset,
2857                 width,
2858         };
2859
2860         LLVMValueRef result = ac_build_intrinsic(ctx,
2861                                                  is_signed ? "llvm.amdgcn.sbfe.i32" :
2862                                                              "llvm.amdgcn.ubfe.i32",
2863                                                  ctx->i32, args, 3,
2864                                                  AC_FUNC_ATTR_READNONE);
2865
2866         if (HAVE_LLVM < 0x0800) {
2867                 /* FIXME: LLVM 7+ returns incorrect result when count is 0.
2868                  * https://bugs.freedesktop.org/show_bug.cgi?id=107276
2869                  */
2870                 LLVMValueRef zero = ctx->i32_0;
2871                 LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, width, zero, "");
2872                 result = LLVMBuildSelect(ctx->builder, icond, zero, result, "");
2873         }
2874
2875         return result;
2876 }
2877
2878 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2879                            LLVMValueRef s1, LLVMValueRef s2)
2880 {
2881         return LLVMBuildAdd(ctx->builder,
2882                             LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2883 }
2884
2885 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2886                            LLVMValueRef s1, LLVMValueRef s2)
2887 {
2888         return LLVMBuildFAdd(ctx->builder,
2889                              LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2890 }
2891
2892 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2893 {
2894         if (!wait_flags)
2895                 return;
2896
2897         unsigned lgkmcnt = 63;
2898         unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
2899         unsigned vscnt = 63;
2900
2901         if (wait_flags & AC_WAIT_LGKM)
2902                 lgkmcnt = 0;
2903         if (wait_flags & AC_WAIT_VLOAD)
2904                 vmcnt = 0;
2905
2906         if (wait_flags & AC_WAIT_VSTORE) {
2907                 if (ctx->chip_class >= GFX10)
2908                         vscnt = 0;
2909                 else
2910                         vmcnt = 0;
2911         }
2912
2913         /* There is no intrinsic for vscnt(0), so use a fence. */
2914         if ((wait_flags & AC_WAIT_LGKM &&
2915              wait_flags & AC_WAIT_VLOAD &&
2916              wait_flags & AC_WAIT_VSTORE) ||
2917             vscnt == 0) {
2918                 LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2919                 return;
2920         }
2921
2922         unsigned simm16 = (lgkmcnt << 8) |
2923                           (7 << 4) | /* expcnt */
2924                           (vmcnt & 0xf) |
2925                           ((vmcnt >> 4) << 14);
2926
2927         LLVMValueRef args[1] = {
2928                 LLVMConstInt(ctx->i32, simm16, false),
2929         };
2930         ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
2931                            ctx->voidt, args, 1, 0);
2932 }
2933
2934 LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0,
2935                             LLVMValueRef src1, LLVMValueRef src2,
2936                             unsigned bitsize)
2937 {
2938         LLVMTypeRef type;
2939         char *intr;
2940
2941         if (bitsize == 16) {
2942                 intr = "llvm.amdgcn.fmed3.f16";
2943                 type = ctx->f16;
2944         } else if (bitsize == 32) {
2945                 intr = "llvm.amdgcn.fmed3.f32";
2946                 type = ctx->f32;
2947         } else {
2948                 intr = "llvm.amdgcn.fmed3.f64";
2949                 type = ctx->f64;
2950         }
2951
2952         LLVMValueRef params[] = {
2953                 src0,
2954                 src1,
2955                 src2,
2956         };
2957         return ac_build_intrinsic(ctx, intr, type, params, 3,
2958                                   AC_FUNC_ATTR_READNONE);
2959 }
2960
2961 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
2962                             unsigned bitsize)
2963 {
2964         LLVMTypeRef type;
2965         char *intr;
2966
2967         if (bitsize == 16) {
2968                 intr = "llvm.amdgcn.fract.f16";
2969                 type = ctx->f16;
2970         } else if (bitsize == 32) {
2971                 intr = "llvm.amdgcn.fract.f32";
2972                 type = ctx->f32;
2973         } else {
2974                 intr = "llvm.amdgcn.fract.f64";
2975                 type = ctx->f64;
2976         }
2977
2978         LLVMValueRef params[] = {
2979                 src0,
2980         };
2981         return ac_build_intrinsic(ctx, intr, type, params, 1,
2982                                   AC_FUNC_ATTR_READNONE);
2983 }
2984
2985 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2986                             unsigned bitsize)
2987 {
2988         LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
2989         LLVMValueRef zero = LLVMConstInt(type, 0, false);
2990         LLVMValueRef one = LLVMConstInt(type, 1, false);
2991
2992         LLVMValueRef cmp, val;
2993         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
2994         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2995         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
2996         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
2997         return val;
2998 }
2999
3000 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
3001                             unsigned bitsize)
3002 {
3003         LLVMValueRef cmp, val, zero, one;
3004         LLVMTypeRef type;
3005
3006         if (bitsize == 16) {
3007                 type = ctx->f16;
3008                 zero = ctx->f16_0;
3009                 one = ctx->f16_1;
3010         } else if (bitsize == 32) {
3011                 type = ctx->f32;
3012                 zero = ctx->f32_0;
3013                 one = ctx->f32_1;
3014         } else {
3015                 type = ctx->f64;
3016                 zero = ctx->f64_0;
3017                 one = ctx->f64_1;
3018         }
3019
3020         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
3021         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
3022         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
3023         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
3024         return val;
3025 }
3026
3027 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
3028 {
3029         LLVMValueRef result;
3030         unsigned bitsize;
3031
3032         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3033
3034         switch (bitsize) {
3035         case 64:
3036                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
3037                                             (LLVMValueRef []) { src0 }, 1,
3038                                             AC_FUNC_ATTR_READNONE);
3039
3040                 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
3041                 break;
3042         case 32:
3043                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
3044                                             (LLVMValueRef []) { src0 }, 1,
3045                                             AC_FUNC_ATTR_READNONE);
3046                 break;
3047         case 16:
3048                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
3049                                             (LLVMValueRef []) { src0 }, 1,
3050                                             AC_FUNC_ATTR_READNONE);
3051
3052                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3053                 break;
3054         case 8:
3055                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8,
3056                                             (LLVMValueRef []) { src0 }, 1,
3057                                             AC_FUNC_ATTR_READNONE);
3058
3059                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3060                 break;
3061         default:
3062                 unreachable(!"invalid bitsize");
3063                 break;
3064         }
3065
3066         return result;
3067 }
3068
3069 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
3070                                        LLVMValueRef src0)
3071 {
3072         LLVMValueRef result;
3073         unsigned bitsize;
3074
3075         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3076
3077         switch (bitsize) {
3078         case 64:
3079                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64,
3080                                             (LLVMValueRef []) { src0 }, 1,
3081                                             AC_FUNC_ATTR_READNONE);
3082
3083                 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
3084                 break;
3085         case 32:
3086                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32,
3087                                             (LLVMValueRef []) { src0 }, 1,
3088                                             AC_FUNC_ATTR_READNONE);
3089                 break;
3090         case 16:
3091                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
3092                                             (LLVMValueRef []) { src0 }, 1,
3093                                             AC_FUNC_ATTR_READNONE);
3094
3095                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3096                 break;
3097         case 8:
3098                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8,
3099                                             (LLVMValueRef []) { src0 }, 1,
3100                                             AC_FUNC_ATTR_READNONE);
3101
3102                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3103                 break;
3104         default:
3105                 unreachable(!"invalid bitsize");
3106                 break;
3107         }
3108
3109         return result;
3110 }
3111
/* Operand indices of an export intrinsic call, as passed to
 * LLVMGetOperand/LLVMSetOperand by the VS output optimization below.
 * The four output channels are AC_EXP_OUT0 .. AC_EXP_OUT0 + 3.
 */
#define AC_EXP_TARGET           0
#define AC_EXP_ENABLED_CHANNELS 1
#define AC_EXP_OUT0             2
3115
/* Classification of one export channel operand. */
enum ac_ir_type {
        AC_IR_UNDEF, /* the operand is an LLVM undef */
        AC_IR_CONST, /* the operand is a floating-point constant */
        AC_IR_VALUE, /* anything else */
};
3121
/* One channel of a parsed PARAM export. */
struct ac_vs_exp_chan
{
        LLVMValueRef value; /* the export operand for this channel */
        float const_float; /* constant value; only set for AC_IR_CONST */
        enum ac_ir_type type; /* how the operand was classified */
};
3128
/* A parsed PARAM export instruction. */
struct ac_vs_exp_inst {
        unsigned offset; /* export target relative to V_008DFC_SQ_EXP_PARAM */
        LLVMValueRef inst; /* the export call instruction itself */
        struct ac_vs_exp_chan chan[4]; /* the four output channels */
};
3134
/* List of the PARAM exports that were kept (not eliminated) so far. */
struct ac_vs_exports {
        unsigned num; /* number of valid entries in exp[] */
        struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};
3139
3140 /* Return true if the PARAM export has been eliminated. */
3141 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
3142                                       uint32_t num_outputs,
3143                                       struct ac_vs_exp_inst *exp)
3144 {
3145         unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
3146         bool is_zero[4] = {}, is_one[4] = {};
3147
3148         for (i = 0; i < 4; i++) {
3149                 /* It's a constant expression. Undef outputs are eliminated too. */
3150                 if (exp->chan[i].type == AC_IR_UNDEF) {
3151                         is_zero[i] = true;
3152                         is_one[i] = true;
3153                 } else if (exp->chan[i].type == AC_IR_CONST) {
3154                         if (exp->chan[i].const_float == 0)
3155                                 is_zero[i] = true;
3156                         else if (exp->chan[i].const_float == 1)
3157                                 is_one[i] = true;
3158                         else
3159                                 return false; /* other constant */
3160                 } else
3161                         return false;
3162         }
3163
3164         /* Only certain combinations of 0 and 1 can be eliminated. */
3165         if (is_zero[0] && is_zero[1] && is_zero[2])
3166                 default_val = is_zero[3] ? 0 : 1;
3167         else if (is_one[0] && is_one[1] && is_one[2])
3168                 default_val = is_zero[3] ? 2 : 3;
3169         else
3170                 return false;
3171
3172         /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
3173         LLVMInstructionEraseFromParent(exp->inst);
3174
3175         /* Change OFFSET to DEFAULT_VAL. */
3176         for (i = 0; i < num_outputs; i++) {
3177                 if (vs_output_param_offset[i] == exp->offset) {
3178                         vs_output_param_offset[i] =
3179                                 AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
3180                         break;
3181                 }
3182         }
3183         return true;
3184 }
3185
/* Try to eliminate a PARAM export that duplicates an already processed one.
 *
 * `exp` is compared channel by channel against every entry of `processed`.
 * If a match is found, channels that are undef in the match but defined in
 * `exp` are copied into the matching instruction, `exp->inst` is erased and
 * the first entry of vs_output_param_offset that equals exp->offset is
 * redirected to the matching export's offset.
 *
 * Return true if the export has been eliminated, false if no processed
 * export matches.
 */
static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
                                           uint8_t *vs_output_param_offset,
                                           uint32_t num_outputs,
                                           struct ac_vs_exports *processed,
                                           struct ac_vs_exp_inst *exp)
{
        /* copy_back_channels: mask of channels to copy from exp into the match. */
        unsigned p, copy_back_channels = 0;

        /* See if the output is already in the list of processed outputs.
         * The LLVMValueRef comparison relies on SSA.
         */
        for (p = 0; p < processed->num; p++) {
                bool different = false;

                for (unsigned j = 0; j < 4; j++) {
                        struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
                        struct ac_vs_exp_chan *c2 = &exp->chan[j];

                        /* Treat undef as a match. */
                        if (c2->type == AC_IR_UNDEF)
                                continue;

                        /* If c1 is undef but c2 isn't, we can copy c2 to c1
                         * and consider the instruction duplicated.
                         */
                        if (c1->type == AC_IR_UNDEF) {
                                copy_back_channels |= 1 << j;
                                continue;
                        }

                        /* Test whether the channels are not equal. */
                        if (c1->type != c2->type ||
                            (c1->type == AC_IR_CONST &&
                             c1->const_float != c2->const_float) ||
                            (c1->type == AC_IR_VALUE &&
                             c1->value != c2->value)) {
                                different = true;
                                break;
                        }
                }
                if (!different)
                        break;

                /* No match: discard the mask collected for this candidate. */
                copy_back_channels = 0;
        }
        /* The loop ran to completion, so nothing matched. */
        if (p == processed->num)
                return false;

        /* If a match was found, but the matching export has undef where the new
         * one has a normal value, copy the normal value to the undef channel.
         */
        struct ac_vs_exp_inst *match = &processed->exp[p];

        /* Get current enabled channels mask. */
        LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
        unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);

        while (copy_back_channels) {
                unsigned chan = u_bit_scan(&copy_back_channels);

                assert(match->chan[chan].type == AC_IR_UNDEF);
                LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
                               exp->chan[chan].value);
                /* Keep the parsed copy in sync with the modified instruction. */
                match->chan[chan] = exp->chan[chan];

                /* Update number of enabled channels because the original mask
                 * is not always 0xf.
                 */
                enabled_channels |= (1 << chan);
                LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
                               LLVMConstInt(ctx->i32, enabled_channels, 0));
        }

        /* The PARAM export is duplicated. Kill it. */
        LLVMInstructionEraseFromParent(exp->inst);

        /* Change OFFSET to the matching export. */
        for (unsigned i = 0; i < num_outputs; i++) {
                if (vs_output_param_offset[i] == exp->offset) {
                        vs_output_param_offset[i] = match->offset;
                        break;
                }
        }
        return true;
}
3271
/* Remove redundant PARAM exports from main_fn and compact the remaining ones.
 *
 * Every call to "llvm.SI.export" or "llvm.amdgcn.exp.f32" whose target is
 * at or above V_008DFC_SQ_EXP_PARAM is parsed and then either eliminated
 * (constant or duplicated export) or recorded as kept. If anything was
 * removed, the kept exports are renumbered consecutively from 0,
 * vs_output_param_offset is updated to the new numbering and
 * *num_param_exports is set to the number of kept exports; otherwise
 * *num_param_exports is left untouched.
 *
 * vs_output_param_offset: per-output PARAM offsets (num_outputs entries are
 *                         examined); updated in place.
 */
void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
                            LLVMValueRef main_fn,
                            uint8_t *vs_output_param_offset,
                            uint32_t num_outputs,
                            uint8_t *num_param_exports)
{
        LLVMBasicBlockRef bb;
        bool removed_any = false;
        struct ac_vs_exports exports;

        exports.num = 0;

        /* Process all LLVM instructions. */
        bb = LLVMGetFirstBasicBlock(main_fn);
        while (bb) {
                LLVMValueRef inst = LLVMGetFirstInstruction(bb);

                while (inst) {
                        /* Advance first: cur may be erased further down. */
                        LLVMValueRef cur = inst;
                        inst = LLVMGetNextInstruction(inst);
                        struct ac_vs_exp_inst exp;

                        if (LLVMGetInstructionOpcode(cur) != LLVMCall)
                                continue;

                        LLVMValueRef callee = ac_llvm_get_called_value(cur);

                        if (!ac_llvm_is_function(callee))
                                continue;

                        const char *name = LLVMGetValueName(callee);
                        unsigned num_args = LLVMCountParams(callee);

                        /* Check if this is an export instruction. */
                        if ((num_args != 9 && num_args != 8) ||
                            (strcmp(name, "llvm.SI.export") &&
                             strcmp(name, "llvm.amdgcn.exp.f32")))
                                continue;

                        LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
                        unsigned target = LLVMConstIntGetZExtValue(arg);

                        /* Skip exports whose target is below the PARAM range. */
                        if (target < V_008DFC_SQ_EXP_PARAM)
                                continue;

                        target -= V_008DFC_SQ_EXP_PARAM;

                        /* Parse the instruction. */
                        memset(&exp, 0, sizeof(exp));
                        exp.offset = target;
                        exp.inst = cur;

                        for (unsigned i = 0; i < 4; i++) {
                                LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);

                                exp.chan[i].value = v;

                                if (LLVMIsUndef(v)) {
                                        exp.chan[i].type = AC_IR_UNDEF;
                                } else if (LLVMIsAConstantFP(v)) {
                                        LLVMBool loses_info;
                                        exp.chan[i].type = AC_IR_CONST;
                                        exp.chan[i].const_float =
                                                LLVMConstRealGetDouble(v, &loses_info);
                                } else {
                                        exp.chan[i].type = AC_IR_VALUE;
                                }
                        }

                        /* Eliminate constant and duplicated PARAM exports. */
                        if (ac_eliminate_const_output(vs_output_param_offset,
                                                      num_outputs, &exp) ||
                            ac_eliminate_duplicated_output(ctx,
                                                           vs_output_param_offset,
                                                           num_outputs, &exports,
                                                           &exp)) {
                                removed_any = true;
                        } else {
                                /* Kept: later exports are compared against it. */
                                exports.exp[exports.num++] = exp;
                        }
                }
                bb = LLVMGetNextBasicBlock(bb);
        }

        /* Remove holes in export memory due to removed PARAM exports.
         * This is done by renumbering all PARAM exports.
         */
        if (removed_any) {
                uint8_t old_offset[VARYING_SLOT_MAX];
                unsigned out, i;

                /* Make a copy of the offsets. We need the old version while
                 * we are modifying some of them. */
                /* NOTE(review): this copies VARYING_SLOT_MAX bytes regardless
                 * of num_outputs - presumably the caller's array is at least
                 * that large; confirm. */
                memcpy(old_offset, vs_output_param_offset,
                       sizeof(old_offset));

                for (i = 0; i < exports.num; i++) {
                        unsigned offset = exports.exp[i].offset;

                        /* Update vs_output_param_offset. Multiple outputs can
                         * have the same offset.
                         */
                        for (out = 0; out < num_outputs; out++) {
                                if (old_offset[out] == offset)
                                        vs_output_param_offset[out] = i;
                        }

                        /* Change the PARAM offset in the instruction. */
                        LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
                                       LLVMConstInt(ctx->i32,
                                                    V_008DFC_SQ_EXP_PARAM + i, 0));
                }
                *num_param_exports = exports.num;
        }
}
3387
3388 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
3389 {
3390         LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
3391         ac_build_intrinsic(ctx,
3392                            "llvm.amdgcn.init.exec", ctx->voidt,
3393                            &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
3394 }
3395
3396 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
3397 {
3398         unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
3399         ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
3400                                      LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
3401                                      "lds");
3402 }
3403
3404 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
3405                          LLVMValueRef dw_addr)
3406 {
3407         return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
3408 }
3409
3410 void ac_lds_store(struct ac_llvm_context *ctx,
3411                   LLVMValueRef dw_addr,
3412                   LLVMValueRef value)
3413 {
3414         value = ac_to_integer(ctx, value);
3415         ac_build_indexed_store(ctx, ctx->lds,
3416                                dw_addr, value);
3417 }
3418
3419 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
3420                          LLVMTypeRef dst_type,
3421                          LLVMValueRef src0)
3422 {
3423         unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3424         const char *intrin_name;
3425         LLVMTypeRef type;
3426         LLVMValueRef zero;
3427
3428         switch (src0_bitsize) {
3429         case 64:
3430                 intrin_name = "llvm.cttz.i64";
3431                 type = ctx->i64;
3432                 zero = ctx->i64_0;
3433                 break;
3434         case 32:
3435                 intrin_name = "llvm.cttz.i32";
3436                 type = ctx->i32;
3437                 zero = ctx->i32_0;
3438                 break;
3439         case 16:
3440                 intrin_name = "llvm.cttz.i16";
3441                 type = ctx->i16;
3442                 zero = ctx->i16_0;
3443                 break;
3444         case 8:
3445                 intrin_name = "llvm.cttz.i8";
3446                 type = ctx->i8;
3447                 zero = ctx->i8_0;
3448                 break;
3449         default:
3450                 unreachable(!"invalid bitsize");
3451         }
3452
3453         LLVMValueRef params[2] = {
3454                 src0,
3455
3456                 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
3457                  * add special code to check for x=0. The reason is that
3458                  * the LLVM behavior for x=0 is different from what we
3459                  * need here. However, LLVM also assumes that ffs(x) is
3460                  * in [0, 31], but GLSL expects that ffs(0) = -1, so
3461                  * a conditional assignment to handle 0 is still required.
3462                  *
3463                  * The hardware already implements the correct behavior.
3464                  */
3465                 ctx->i1true,
3466         };
3467
3468         LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
3469                                               params, 2,
3470                                               AC_FUNC_ATTR_READNONE);
3471
3472         if (src0_bitsize == 64) {
3473                 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
3474         } else if (src0_bitsize < 32) {
3475                 lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
3476         }
3477
3478         /* TODO: We need an intrinsic to skip this conditional. */
3479         /* Check for zero: */
3480         return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
3481                                                            LLVMIntEQ, src0,
3482                                                            zero, ""),
3483                                LLVMConstInt(ctx->i32, -1, 0), lsb, "");
3484 }
3485
3486 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
3487 {
3488         return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
3489 }
3490
3491 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
3492 {
3493         return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
3494 }
3495
3496 static struct ac_llvm_flow *
3497 get_current_flow(struct ac_llvm_context *ctx)
3498 {
3499         if (ctx->flow->depth > 0)
3500                 return &ctx->flow->stack[ctx->flow->depth - 1];
3501         return NULL;
3502 }
3503
3504 static struct ac_llvm_flow *
3505 get_innermost_loop(struct ac_llvm_context *ctx)
3506 {
3507         for (unsigned i = ctx->flow->depth; i > 0; --i) {
3508                 if (ctx->flow->stack[i - 1].loop_entry_block)
3509                         return &ctx->flow->stack[i - 1];
3510         }
3511         return NULL;
3512 }
3513
3514 static struct ac_llvm_flow *
3515 push_flow(struct ac_llvm_context *ctx)
3516 {
3517         struct ac_llvm_flow *flow;
3518
3519         if (ctx->flow->depth >= ctx->flow->depth_max) {
3520                 unsigned new_max = MAX2(ctx->flow->depth << 1,
3521                                         AC_LLVM_INITIAL_CF_DEPTH);
3522
3523                 ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
3524                 ctx->flow->depth_max = new_max;
3525         }
3526
3527         flow = &ctx->flow->stack[ctx->flow->depth];
3528         ctx->flow->depth++;
3529
3530         flow->next_block = NULL;
3531         flow->loop_entry_block = NULL;
3532         return flow;
3533 }
3534
3535 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
3536                                 int label_id)
3537 {
3538         char buf[32];
3539         snprintf(buf, sizeof(buf), "%s%d", base, label_id);
3540         LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
3541 }
3542
3543 /* Append a basic block at the level of the parent flow.
3544  */
3545 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
3546                                             const char *name)
3547 {
3548         assert(ctx->flow->depth >= 1);
3549
3550         if (ctx->flow->depth >= 2) {
3551                 struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
3552
3553                 return LLVMInsertBasicBlockInContext(ctx->context,
3554                                                      flow->next_block, name);
3555         }
3556
3557         LLVMValueRef main_fn =
3558                 LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3559         return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3560 }
3561
3562 /* Emit a branch to the given default target for the current block if
3563  * applicable -- that is, if the current block does not already contain a
3564  * branch from a break or continue.
3565  */
3566 static void emit_default_branch(LLVMBuilderRef builder,
3567                                 LLVMBasicBlockRef target)
3568 {
3569         if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3570                  LLVMBuildBr(builder, target);
3571 }
3572
3573 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3574 {
3575         struct ac_llvm_flow *flow = push_flow(ctx);
3576         flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3577         flow->next_block = append_basic_block(ctx, "ENDLOOP");
3578         set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3579         LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3580         LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3581 }
3582
3583 void ac_build_break(struct ac_llvm_context *ctx)
3584 {
3585         struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3586         LLVMBuildBr(ctx->builder, flow->next_block);
3587 }
3588
3589 void ac_build_continue(struct ac_llvm_context *ctx)
3590 {
3591         struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3592         LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3593 }
3594
3595 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3596 {
3597         struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3598         LLVMBasicBlockRef endif_block;
3599
3600         assert(!current_branch->loop_entry_block);
3601
3602         endif_block = append_basic_block(ctx, "ENDIF");
3603         emit_default_branch(ctx->builder, endif_block);
3604
3605         LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3606         set_basicblock_name(current_branch->next_block, "else", label_id);
3607
3608         current_branch->next_block = endif_block;
3609 }
3610
3611 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3612 {
3613         struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3614
3615         assert(!current_branch->loop_entry_block);
3616
3617         emit_default_branch(ctx->builder, current_branch->next_block);
3618         LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3619         set_basicblock_name(current_branch->next_block, "endif", label_id);
3620
3621         ctx->flow->depth--;
3622 }
3623
3624 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3625 {
3626         struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3627
3628         assert(current_loop->loop_entry_block);
3629
3630         emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3631
3632         LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3633         set_basicblock_name(current_loop->next_block, "endloop", label_id);
3634         ctx->flow->depth--;
3635 }
3636
3637 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3638 {
3639         struct ac_llvm_flow *flow = push_flow(ctx);
3640         LLVMBasicBlockRef if_block;
3641
3642         if_block = append_basic_block(ctx, "IF");
3643         flow->next_block = append_basic_block(ctx, "ELSE");
3644         set_basicblock_name(if_block, "if", label_id);
3645         LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3646         LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3647 }
3648
3649 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
3650                  int label_id)
3651 {
3652         LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
3653                                           value, ctx->f32_0, "");
3654         ac_build_ifcc(ctx, cond, label_id);
3655 }
3656
3657 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
3658                   int label_id)
3659 {
3660         LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3661                                           ac_to_integer(ctx, value),
3662                                           ctx->i32_0, "");
3663         ac_build_ifcc(ctx, cond, label_id);
3664 }
3665
3666 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
3667                              const char *name)
3668 {
3669         LLVMBuilderRef builder = ac->builder;
3670         LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3671         LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3672         LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3673         LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3674         LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3675         LLVMValueRef res;
3676
3677         if (first_instr) {
3678                 LLVMPositionBuilderBefore(first_builder, first_instr);
3679         } else {
3680                 LLVMPositionBuilderAtEnd(first_builder, first_block);
3681         }
3682
3683         res = LLVMBuildAlloca(first_builder, type, name);
3684         LLVMDisposeBuilder(first_builder);
3685         return res;
3686 }
3687
3688 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
3689                                    LLVMTypeRef type, const char *name)
3690 {
3691         LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3692         LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3693         return ptr;
3694 }
3695
3696 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
3697                          LLVMTypeRef type)
3698 {
3699         int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3700         return LLVMBuildBitCast(ctx->builder, ptr,
3701                                 LLVMPointerType(type, addr_space), "");
3702 }
3703
3704 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
3705                             unsigned count)
3706 {
3707         unsigned num_components = ac_get_llvm_num_components(value);
3708         if (count == num_components)
3709                 return value;
3710
3711         LLVMValueRef masks[MAX2(count, 2)];
3712         masks[0] = ctx->i32_0;
3713         masks[1] = ctx->i32_1;
3714         for (unsigned i = 2; i < count; i++)
3715                 masks[i] = LLVMConstInt(ctx->i32, i, false);
3716
3717         if (count == 1)
3718                 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
3719                                                "");
3720
3721         LLVMValueRef swizzle = LLVMConstVector(masks, count);
3722         return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3723 }
3724
3725 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
3726                              unsigned rshift, unsigned bitwidth)
3727 {
3728         LLVMValueRef value = param;
3729         if (rshift)
3730                 value = LLVMBuildLShr(ctx->builder, value,
3731                                       LLVMConstInt(ctx->i32, rshift, false), "");
3732
3733         if (rshift + bitwidth < 32) {
3734                 unsigned mask = (1 << bitwidth) - 1;
3735                 value = LLVMBuildAnd(ctx->builder, value,
3736                                      LLVMConstInt(ctx->i32, mask, false), "");
3737         }
3738         return value;
3739 }
3740
3741 /* Adjust the sample index according to FMASK.
3742  *
3743  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3744  * which is the identity mapping. Each nibble says which physical sample
3745  * should be fetched to get that sample.
3746  *
3747  * For example, 0x11111100 means there are only 2 samples stored and
3748  * the second sample covers 3/4 of the pixel. When reading samples 0
3749  * and 1, return physical sample 0 (determined by the first two 0s
3750  * in FMASK), otherwise return physical sample 1.
3751  *
3752  * The sample index should be adjusted as follows:
3753  *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3754  */
3755 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
3756                               LLVMValueRef *addr, bool is_array_tex)
3757 {
3758         struct ac_image_args fmask_load = {};
3759         fmask_load.opcode = ac_image_load;
3760         fmask_load.resource = fmask;
3761         fmask_load.dmask = 0xf;
3762         fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3763         fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3764
3765         fmask_load.coords[0] = addr[0];
3766         fmask_load.coords[1] = addr[1];
3767         if (is_array_tex)
3768                 fmask_load.coords[2] = addr[2];
3769
3770         LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3771         fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
3772                                               ac->i32_0, "");
3773
3774         /* Apply the formula. */
3775         unsigned sample_chan = is_array_tex ? 3 : 2;
3776         LLVMValueRef final_sample;
3777         final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3778                                     LLVMConstInt(ac->i32, 4, 0), "");
3779         final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3780         /* Mask the sample index by 0x7, because 0x8 means an unknown value
3781          * with EQAA, so those will map to 0. */
3782         final_sample = LLVMBuildAnd(ac->builder, final_sample,
3783                                     LLVMConstInt(ac->i32, 0x7, 0), "");
3784
3785         /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3786          * resource descriptor is 0 (invalid).
3787          */
3788         LLVMValueRef tmp;
3789         tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3790         tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3791         tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3792
3793         /* Replace the MSAA sample index. */
3794         addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
3795                                             addr[sample_chan], "");
3796 }
3797
3798 static LLVMValueRef
3799 _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3800 {
3801         ac_build_optimization_barrier(ctx, &src);
3802         return ac_build_intrinsic(ctx,
3803                         lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3804                         LLVMTypeOf(src), (LLVMValueRef []) {
3805                         src, lane },
3806                         lane == NULL ? 1 : 2,
3807                         AC_FUNC_ATTR_READNONE |
3808                         AC_FUNC_ATTR_CONVERGENT);
3809 }
3810
3811 /**
3812  * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3813  * @param ctx
3814  * @param src
3815  * @param lane - id of the lane or NULL for the first active lane
3816  * @return value of the lane
3817  */
3818 LLVMValueRef
3819 ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3820 {
3821         LLVMTypeRef src_type = LLVMTypeOf(src);
3822         src = ac_to_integer(ctx, src);
3823         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3824         LLVMValueRef ret;
3825
3826         if (bits == 32) {
3827                 ret = _ac_build_readlane(ctx, src, lane);
3828         } else {
3829                 assert(bits % 32 == 0);
3830                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3831                 LLVMValueRef src_vector =
3832                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3833                 ret = LLVMGetUndef(vec_type);
3834                 for (unsigned i = 0; i < bits / 32; i++) {
3835                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3836                                                 LLVMConstInt(ctx->i32, i, 0), "");
3837                         LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
3838                         ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
3839                                                 LLVMConstInt(ctx->i32, i, 0), "");
3840                 }
3841         }
3842         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3843 }
3844
3845 LLVMValueRef
3846 ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
3847 {
3848         if (HAVE_LLVM >= 0x0800) {
3849                 return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
3850                                           (LLVMValueRef []) {value, lane, src}, 3,
3851                                           AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3852         }
3853
3854         LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
3855                                           ac_get_thread_id(ctx), "");
3856         return LLVMBuildSelect(ctx->builder, pred, value, src, "");
3857 }
3858
3859 LLVMValueRef
3860 ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3861 {
3862         if (ctx->wave_size == 32) {
3863                 return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3864                                           (LLVMValueRef []) { mask, ctx->i32_0 },
3865                                           2, AC_FUNC_ATTR_READNONE);
3866         }
3867         LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
3868                                                  LLVMVectorType(ctx->i32, 2),
3869                                                  "");
3870         LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
3871                                                        ctx->i32_0, "");
3872         LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
3873                                                        ctx->i32_1, "");
3874         LLVMValueRef val =
3875                 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3876                                    (LLVMValueRef []) { mask_lo, ctx->i32_0 },
3877                                    2, AC_FUNC_ATTR_READNONE);
3878         val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
3879                                  (LLVMValueRef []) { mask_hi, val },
3880                                  2, AC_FUNC_ATTR_READNONE);
3881         return val;
3882 }
3883
/* Values passed as the dpp_ctrl operand of the DPP intrinsic.
 *
 * The underscore-prefixed entries are base values; dpp_quad_perm(),
 * dpp_row_sl() and dpp_row_sr() OR an operand into the low bits of the
 * corresponding base. The remaining entries are used as they are.
 */
enum dpp_ctrl {
        /* Bases that take an operand in their low bits. */
        _dpp_quad_perm      = 0x000,
        _dpp_row_sl         = 0x100,
        _dpp_row_sr         = 0x110,
        _dpp_row_rr         = 0x120,

        /* Complete control values. */
        dpp_wf_sl1          = 0x130,
        dpp_wf_rl1          = 0x134,
        dpp_wf_sr1          = 0x138,
        dpp_wf_rr1          = 0x13C,
        dpp_row_mirror      = 0x140,
        dpp_row_half_mirror = 0x141,
        dpp_row_bcast15     = 0x142,
        dpp_row_bcast31     = 0x143,
};
3898
3899 static inline enum dpp_ctrl
3900 dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
3901 {
3902         assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3903         return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3904 }
3905
3906 static inline enum dpp_ctrl
3907 dpp_row_sl(unsigned amount)
3908 {
3909         assert(amount > 0 && amount < 16);
3910         return _dpp_row_sl | amount;
3911 }
3912
3913 static inline enum dpp_ctrl
3914 dpp_row_sr(unsigned amount)
3915 {
3916         assert(amount > 0 && amount < 16);
3917         return _dpp_row_sr | amount;
3918 }
3919
3920 static LLVMValueRef
3921 _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3922               enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3923               bool bound_ctrl)
3924 {
3925         return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
3926                                         LLVMTypeOf(old),
3927                                         (LLVMValueRef[]) {
3928                                                 old, src,
3929                                                 LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3930                                                 LLVMConstInt(ctx->i32, row_mask, 0),
3931                                                 LLVMConstInt(ctx->i32, bank_mask, 0),
3932                                                 LLVMConstInt(ctx->i1, bound_ctrl, 0) },
3933                                         6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3934 }
3935
3936 static LLVMValueRef
3937 ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3938              enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3939              bool bound_ctrl)
3940 {
3941         LLVMTypeRef src_type = LLVMTypeOf(src);
3942         src = ac_to_integer(ctx, src);
3943         old = ac_to_integer(ctx, old);
3944         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3945         LLVMValueRef ret;
3946         if (bits == 32) {
3947                 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
3948                                     bank_mask, bound_ctrl);
3949         } else {
3950                 assert(bits % 32 == 0);
3951                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3952                 LLVMValueRef src_vector =
3953                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3954                 LLVMValueRef old_vector =
3955                         LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3956                 ret = LLVMGetUndef(vec_type);
3957                 for (unsigned i = 0; i < bits / 32; i++) {
3958                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3959                                                       LLVMConstInt(ctx->i32, i,
3960                                                                    0), "");
3961                         old = LLVMBuildExtractElement(ctx->builder, old_vector,
3962                                                       LLVMConstInt(ctx->i32, i,
3963                                                                    0), "");
3964                         LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
3965                                                               dpp_ctrl,
3966                                                               row_mask,
3967                                                               bank_mask,
3968                                                               bound_ctrl);
3969                         ret = LLVMBuildInsertElement(ctx->builder, ret,
3970                                                      ret_comp,
3971                                                      LLVMConstInt(ctx->i32, i,
3972                                                                   0), "");
3973                 }
3974         }
3975         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3976 }
3977
3978 static LLVMValueRef
3979 _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3980                      bool exchange_rows, bool bound_ctrl)
3981 {
3982         LLVMValueRef args[6] = {
3983                 src,
3984                 src,
3985                 LLVMConstInt(ctx->i32, sel, false),
3986                 LLVMConstInt(ctx->i32, sel >> 32, false),
3987                 ctx->i1true, /* fi */
3988                 bound_ctrl ? ctx->i1true : ctx->i1false,
3989         };
3990         return ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16"
3991                                                      : "llvm.amdgcn.permlane16",
3992                                   ctx->i32, args, 6,
3993                                   AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3994 }
3995
3996 static LLVMValueRef
3997 ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3998                     bool exchange_rows, bool bound_ctrl)
3999 {
4000         LLVMTypeRef src_type = LLVMTypeOf(src);
4001         src = ac_to_integer(ctx, src);
4002         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
4003         LLVMValueRef ret;
4004         if (bits == 32) {
4005                 ret = _ac_build_permlane16(ctx, src, sel, exchange_rows,
4006                                            bound_ctrl);
4007         } else {
4008                 assert(bits % 32 == 0);
4009                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
4010                 LLVMValueRef src_vector =
4011                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
4012                 ret = LLVMGetUndef(vec_type);
4013                 for (unsigned i = 0; i < bits / 32; i++) {
4014                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
4015                                                       LLVMConstInt(ctx->i32, i,
4016                                                                    0), "");
4017                         LLVMValueRef ret_comp =
4018                                 _ac_build_permlane16(ctx, src, sel,
4019                                                      exchange_rows,
4020                                                      bound_ctrl);
4021                         ret = LLVMBuildInsertElement(ctx->builder, ret,
4022                                                      ret_comp,
4023                                                      LLVMConstInt(ctx->i32, i,
4024                                                                   0), "");
4025                 }
4026         }
4027         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4028 }
4029
/* Pack three 5-bit masks into one swizzle pattern: and_mask in bits 0-4,
 * or_mask in bits 5-9 and xor_mask in bits 10-14. Each mask must be below 32.
 */
static inline unsigned
ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
        assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

        unsigned pattern = and_mask;

        pattern |= or_mask << 5;
        pattern |= xor_mask << 10;
        return pattern;
}
4036
4037 static LLVMValueRef
4038 _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
4039 {
4040         return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
4041                                    LLVMTypeOf(src), (LLVMValueRef []) {
4042                                         src, LLVMConstInt(ctx->i32, mask, 0) },
4043                                    2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
4044 }
4045
4046 LLVMValueRef
4047 ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
4048 {
4049         LLVMTypeRef src_type = LLVMTypeOf(src);
4050         src = ac_to_integer(ctx, src);
4051         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
4052         LLVMValueRef ret;
4053         if (bits == 32) {
4054                 ret = _ac_build_ds_swizzle(ctx, src, mask);
4055         } else {
4056                 assert(bits % 32 == 0);
4057                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
4058                 LLVMValueRef src_vector =
4059                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
4060                 ret = LLVMGetUndef(vec_type);
4061                 for (unsigned i = 0; i < bits / 32; i++) {
4062                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
4063                                                       LLVMConstInt(ctx->i32, i,
4064                                                                    0), "");
4065                         LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
4066                                                                      mask);
4067                         ret = LLVMBuildInsertElement(ctx->builder, ret,
4068                                                      ret_comp,
4069                                                      LLVMConstInt(ctx->i32, i,
4070                                                                   0), "");
4071                 }
4072         }
4073         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4074 }
4075
4076 static LLVMValueRef
4077 ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
4078 {
4079         char name[32], type[8];
4080         ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
4081         snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
4082         return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
4083                                   (LLVMValueRef []) { src }, 1,
4084                                   AC_FUNC_ATTR_READNONE);
4085 }
4086
4087 static LLVMValueRef
4088 ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
4089                       LLVMValueRef inactive)
4090 {
4091         char name[33], type[8];
4092         LLVMTypeRef src_type = LLVMTypeOf(src);
4093         src = ac_to_integer(ctx, src);
4094         inactive = ac_to_integer(ctx, inactive);
4095         ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
4096         snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
4097         LLVMValueRef ret =
4098                 ac_build_intrinsic(ctx, name,
4099                                         LLVMTypeOf(src), (LLVMValueRef []) {
4100                                         src, inactive }, 2,
4101                                         AC_FUNC_ATTR_READNONE |
4102                                         AC_FUNC_ATTR_CONVERGENT);
4103         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4104 }
4105
4106 static LLVMValueRef
4107 get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
4108 {
4109         if (type_size == 4) {
4110                 switch (op) {
4111                 case nir_op_iadd: return ctx->i32_0;
4112                 case nir_op_fadd: return ctx->f32_0;
4113                 case nir_op_imul: return ctx->i32_1;
4114                 case nir_op_fmul: return ctx->f32_1;
4115                 case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
4116                 case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
4117                 case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
4118                 case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
4119                 case nir_op_umax: return ctx->i32_0;
4120                 case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
4121                 case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
4122                 case nir_op_ior: return ctx->i32_0;
4123                 case nir_op_ixor: return ctx->i32_0;
4124                 default:
4125                         unreachable("bad reduction intrinsic");
4126                 }
4127         } else { /* type_size == 64bit */
4128                 switch (op) {
4129                 case nir_op_iadd: return ctx->i64_0;
4130                 case nir_op_fadd: return ctx->f64_0;
4131                 case nir_op_imul: return ctx->i64_1;
4132                 case nir_op_fmul: return ctx->f64_1;
4133                 case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
4134                 case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
4135                 case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
4136                 case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
4137                 case nir_op_umax: return ctx->i64_0;
4138                 case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
4139                 case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
4140                 case nir_op_ior: return ctx->i64_0;
4141                 case nir_op_ixor: return ctx->i64_0;
4142                 default:
4143                         unreachable("bad reduction intrinsic");
4144                 }
4145         }
4146 }
4147
4148 static LLVMValueRef
4149 ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
4150 {
4151         bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
4152         switch (op) {
4153         case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
4154         case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
4155         case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
4156         case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
4157         case nir_op_imin: return LLVMBuildSelect(ctx->builder,
4158                                         LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
4159                                         lhs, rhs, "");
4160         case nir_op_umin: return LLVMBuildSelect(ctx->builder,
4161                                         LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
4162                                         lhs, rhs, "");
4163         case nir_op_fmin: return ac_build_intrinsic(ctx,
4164                                         _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
4165                                         _64bit ? ctx->f64 : ctx->f32,
4166                                         (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
4167         case nir_op_imax: return LLVMBuildSelect(ctx->builder,
4168                                         LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
4169                                         lhs, rhs, "");
4170         case nir_op_umax: return LLVMBuildSelect(ctx->builder,
4171                                         LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
4172                                         lhs, rhs, "");
4173         case nir_op_fmax: return ac_build_intrinsic(ctx,
4174                                         _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
4175                                         _64bit ? ctx->f64 : ctx->f32,
4176                                         (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
4177         case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
4178         case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
4179         case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
4180         default:
4181                 unreachable("bad reduction intrinsic");
4182         }
4183 }
4184
4185 /**
4186  * \param maxprefix specifies that the result only needs to be correct for a
4187  *     prefix of this many threads
4188  *
4189  * TODO: add inclusive and excluse scan functions for GFX6.
4190  */
4191 static LLVMValueRef
4192 ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
4193               unsigned maxprefix, bool inclusive)
4194 {
4195         LLVMValueRef result, tmp;
4196
4197         if (ctx->chip_class >= GFX10) {
4198                 result = inclusive ? src : identity;
4199         } else {
4200                 if (inclusive)
4201                         result = src;
4202                 else
4203                         result = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
4204         }
4205         if (maxprefix <= 1)
4206                 return result;
4207         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
4208         result = ac_build_alu_op(ctx, result, tmp, op);
4209         if (maxprefix <= 2)
4210                 return result;
4211         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
4212         result = ac_build_alu_op(ctx, result, tmp, op);
4213         if (maxprefix <= 3)
4214                 return result;
4215         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
4216         result = ac_build_alu_op(ctx, result, tmp, op);
4217         if (maxprefix <= 4)
4218                 return result;
4219         tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
4220         result = ac_build_alu_op(ctx, result, tmp, op);
4221         if (maxprefix <= 8)
4222                 return result;
4223         tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
4224         result = ac_build_alu_op(ctx, result, tmp, op);
4225         if (maxprefix <= 16)
4226                 return result;
4227
4228         if (ctx->chip_class >= GFX10) {
4229                 /* dpp_row_bcast{15,31} are not supported on gfx10. */
4230                 LLVMBuilderRef builder = ctx->builder;
4231                 LLVMValueRef tid = ac_get_thread_id(ctx);
4232                 LLVMValueRef cc;
4233                 /* TODO-GFX10: Can we get better code-gen by putting this into
4234                  * a branch so that LLVM generates EXEC mask manipulations? */
4235                 if (inclusive)
4236                         tmp = result;
4237                 else
4238                         tmp = ac_build_alu_op(ctx, result, src, op);
4239                 tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
4240                 tmp = ac_build_alu_op(ctx, result, tmp, op);
4241                 cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
4242                 cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
4243                 result = LLVMBuildSelect(builder, cc, tmp, result, "");
4244                 if (maxprefix <= 32)
4245                         return result;
4246
4247                 if (inclusive)
4248                         tmp = result;
4249                 else
4250                         tmp = ac_build_alu_op(ctx, result, src, op);
4251                 tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
4252                 tmp = ac_build_alu_op(ctx, result, tmp, op);
4253                 cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
4254                                    LLVMConstInt(ctx->i32, 32, false), "");
4255                 result = LLVMBuildSelect(builder, cc, tmp, result, "");
4256                 return result;
4257         }
4258
4259         tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4260         result = ac_build_alu_op(ctx, result, tmp, op);
4261         if (maxprefix <= 32)
4262                 return result;
4263         tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4264         result = ac_build_alu_op(ctx, result, tmp, op);
4265         return result;
4266 }
4267
4268 LLVMValueRef
4269 ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4270 {
4271         LLVMValueRef result;
4272
4273         if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4274                 LLVMBuilderRef builder = ctx->builder;
4275                 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4276                 result = ac_build_ballot(ctx, src);
4277                 result = ac_build_mbcnt(ctx, result);
4278                 result = LLVMBuildAdd(builder, result, src, "");
4279                 return result;
4280         }
4281
4282         ac_build_optimization_barrier(ctx, &src);
4283
4284         LLVMValueRef identity =
4285                 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4286         result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4287                                   LLVMTypeOf(identity), "");
4288         result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
4289
4290         return ac_build_wwm(ctx, result);
4291 }
4292
4293 LLVMValueRef
4294 ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4295 {
4296         LLVMValueRef result;
4297
4298         if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4299                 LLVMBuilderRef builder = ctx->builder;
4300                 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4301                 result = ac_build_ballot(ctx, src);
4302                 result = ac_build_mbcnt(ctx, result);
4303                 return result;
4304         }
4305
4306         ac_build_optimization_barrier(ctx, &src);
4307
4308         LLVMValueRef identity =
4309                 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4310         result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4311                                   LLVMTypeOf(identity), "");
4312         result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
4313
4314         return ac_build_wwm(ctx, result);
4315 }
4316
4317 LLVMValueRef
4318 ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
4319 {
4320         if (cluster_size == 1) return src;
4321         ac_build_optimization_barrier(ctx, &src);
4322         LLVMValueRef result, swap;
4323         LLVMValueRef identity = get_reduction_identity(ctx, op,
4324                                                                 ac_get_type_size(LLVMTypeOf(src)));
4325         result = LLVMBuildBitCast(ctx->builder,
4326                                                                 ac_build_set_inactive(ctx, src, identity),
4327                                                                 LLVMTypeOf(identity), "");
4328         swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
4329         result = ac_build_alu_op(ctx, result, swap, op);
4330         if (cluster_size == 2) return ac_build_wwm(ctx, result);
4331
4332         swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
4333         result = ac_build_alu_op(ctx, result, swap, op);
4334         if (cluster_size == 4) return ac_build_wwm(ctx, result);
4335
4336         if (ctx->chip_class >= GFX8)
4337                 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
4338         else
4339                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
4340         result = ac_build_alu_op(ctx, result, swap, op);
4341         if (cluster_size == 8) return ac_build_wwm(ctx, result);
4342
4343         if (ctx->chip_class >= GFX8)
4344                 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
4345         else
4346                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
4347         result = ac_build_alu_op(ctx, result, swap, op);
4348         if (cluster_size == 16) return ac_build_wwm(ctx, result);
4349
4350         if (ctx->chip_class >= GFX10)
4351                 swap = ac_build_permlane16(ctx, result, 0, true, false);
4352         else if (ctx->chip_class >= GFX8 && cluster_size != 32)
4353                 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4354         else
4355                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
4356         result = ac_build_alu_op(ctx, result, swap, op);
4357         if (cluster_size == 32) return ac_build_wwm(ctx, result);
4358
4359         if (ctx->chip_class >= GFX8) {
4360                 if (ctx->chip_class >= GFX10)
4361                         swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
4362                 else
4363                         swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4364                 result = ac_build_alu_op(ctx, result, swap, op);
4365                 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
4366                 return ac_build_wwm(ctx, result);
4367         } else {
4368                 swap = ac_build_readlane(ctx, result, ctx->i32_0);
4369                 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
4370                 result = ac_build_alu_op(ctx, result, swap, op);
4371                 return ac_build_wwm(ctx, result);
4372         }
4373 }
4374
4375 /**
4376  * "Top half" of a scan that reduces per-wave values across an entire
4377  * workgroup.
4378  *
4379  * The source value must be present in the highest lane of the wave, and the
4380  * highest lane must be live.
4381  */
4382 void
4383 ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4384 {
4385         if (ws->maxwaves <= 1)
4386                 return;
4387
4388         const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
4389         LLVMBuilderRef builder = ctx->builder;
4390         LLVMValueRef tid = ac_get_thread_id(ctx);
4391         LLVMValueRef tmp;
4392
4393         tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
4394         ac_build_ifcc(ctx, tmp, 1000);
4395         LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4396         ac_build_endif(ctx, 1000);
4397 }
4398
4399 /**
4400  * "Bottom half" of a scan that reduces per-wave values across an entire
4401  * workgroup.
4402  *
4403  * The caller must place a barrier between the top and bottom halves.
4404  */
4405 void
4406 ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4407 {
4408         const LLVMTypeRef type = LLVMTypeOf(ws->src);
4409         const LLVMValueRef identity =
4410                 get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
4411
4412         if (ws->maxwaves <= 1) {
4413                 ws->result_reduce = ws->src;
4414                 ws->result_inclusive = ws->src;
4415                 ws->result_exclusive = identity;
4416                 return;
4417         }
4418         assert(ws->maxwaves <= 32);
4419
4420         LLVMBuilderRef builder = ctx->builder;
4421         LLVMValueRef tid = ac_get_thread_id(ctx);
4422         LLVMBasicBlockRef bbs[2];
4423         LLVMValueRef phivalues_scan[2];
4424         LLVMValueRef tmp, tmp2;
4425
4426         bbs[0] = LLVMGetInsertBlock(builder);
4427         phivalues_scan[0] = LLVMGetUndef(type);
4428
4429         if (ws->enable_reduce)
4430                 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4431         else if (ws->enable_inclusive)
4432                 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4433         else
4434                 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4435         ac_build_ifcc(ctx, tmp, 1001);
4436         {
4437                 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
4438
4439                 ac_build_optimization_barrier(ctx, &tmp);
4440
4441                 bbs[1] = LLVMGetInsertBlock(builder);
4442                 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
4443         }
4444         ac_build_endif(ctx, 1001);
4445
4446         const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4447
4448         if (ws->enable_reduce) {
4449                 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4450                 ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4451         }
4452         if (ws->enable_inclusive)
4453                 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4454         if (ws->enable_exclusive) {
4455                 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4456                 tmp = ac_build_readlane(ctx, scan, tmp);
4457                 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4458                 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4459         }
4460 }
4461
/**
 * Scan of a per-wave value across an entire workgroup: the top half, an
 * s_barrier, then the bottom half.
 *
 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
 * of the workgroup are live. (This requirement cannot easily be relaxed in a
 * useful manner because of the barrier in the algorithm.)
 */
void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
        /* Publish this wave's value. */
        ac_build_wg_wavescan_top(ctx, ws);

        /* The bottom half must not run before every wave has published. */
        ac_build_s_barrier(ctx);

        /* Combine the published per-wave values. */
        ac_build_wg_wavescan_bottom(ctx, ws);
}
4478
4479 /**
4480  * "Top half" of a scan that reduces per-thread values across an entire
4481  * workgroup.
4482  *
4483  * All lanes must be active when this code runs.
4484  */
4485 void
4486 ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4487 {
4488         if (ws->enable_exclusive) {
4489                 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4490                 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4491                         ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4492                 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4493         } else {
4494                 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4495         }
4496
4497         bool enable_inclusive = ws->enable_inclusive;
4498         bool enable_exclusive = ws->enable_exclusive;
4499         ws->enable_inclusive = false;
4500         ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4501         ac_build_wg_wavescan_top(ctx, ws);
4502         ws->enable_inclusive = enable_inclusive;
4503         ws->enable_exclusive = enable_exclusive;
4504 }
4505
4506 /**
4507  * "Bottom half" of a scan that reduces per-thread values across an entire
4508  * workgroup.
4509  *
4510  * The caller must place a barrier between the top and bottom halves.
4511  */
4512 void
4513 ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4514 {
4515         bool enable_inclusive = ws->enable_inclusive;
4516         bool enable_exclusive = ws->enable_exclusive;
4517         ws->enable_inclusive = false;
4518         ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4519         ac_build_wg_wavescan_bottom(ctx, ws);
4520         ws->enable_inclusive = enable_inclusive;
4521         ws->enable_exclusive = enable_exclusive;
4522
4523         /* ws->result_reduce is already the correct value */
4524         if (ws->enable_inclusive)
4525                 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4526         if (ws->enable_exclusive)
4527                 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4528 }
4529
/**
 * A scan that reduces per-thread values across an entire workgroup: the top
 * half, an s_barrier, then the bottom half.
 *
 * The caller must ensure that all lanes are active when this code runs
 * (WWM is insufficient!), because there is an implied barrier.
 */
void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
        /* In-wave scans and publication of the per-wave value. */
        ac_build_wg_scan_top(ctx, ws);

        /* The bottom half must not run before every wave has published. */
        ac_build_s_barrier(ctx);

        /* Cross-wave combination into the final per-thread results. */
        ac_build_wg_scan_bottom(ctx, ws);
}
4543
4544 LLVMValueRef
4545 ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
4546                 unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
4547 {
4548         unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4549         if (ctx->chip_class >= GFX8) {
4550                 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4551         } else {
4552                 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4553         }
4554 }
4555
4556 LLVMValueRef
4557 ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4558 {
4559         index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4560         return ac_build_intrinsic(ctx,
4561                   "llvm.amdgcn.ds.bpermute", ctx->i32,
4562                   (LLVMValueRef []) {index, src}, 2,
4563                   AC_FUNC_ATTR_READNONE |
4564                   AC_FUNC_ATTR_CONVERGENT);
4565 }
4566
4567 LLVMValueRef
4568 ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0,
4569                    unsigned bitsize)
4570 {
4571         LLVMTypeRef type;
4572         char *intr;
4573
4574         if (bitsize == 16) {
4575                 intr = "llvm.amdgcn.frexp.exp.i16.f16";
4576                 type = ctx->i16;
4577         } else if (bitsize == 32) {
4578                 intr = "llvm.amdgcn.frexp.exp.i32.f32";
4579                 type = ctx->i32;
4580         } else {
4581                 intr = "llvm.amdgcn.frexp.exp.i32.f64";
4582                 type = ctx->i32;
4583         }
4584
4585         LLVMValueRef params[] = {
4586                 src0,
4587         };
4588         return ac_build_intrinsic(ctx, intr, type, params, 1,
4589                                   AC_FUNC_ATTR_READNONE);
4590 }
4591 LLVMValueRef
4592 ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0,
4593                     unsigned bitsize)
4594 {
4595         LLVMTypeRef type;
4596         char *intr;
4597
4598         if (bitsize == 16) {
4599                 intr = "llvm.amdgcn.frexp.mant.f16";
4600                 type = ctx->f16;
4601         } else if (bitsize == 32) {
4602                 intr = "llvm.amdgcn.frexp.mant.f32";
4603                 type = ctx->f32;
4604         } else {
4605                 intr = "llvm.amdgcn.frexp.mant.f64";
4606                 type = ctx->f64;
4607         }
4608
4609         LLVMValueRef params[] = {
4610                 src0,
4611         };
4612         return ac_build_intrinsic(ctx, intr, type, params, 1,
4613                                   AC_FUNC_ATTR_READNONE);
4614 }
4615
4616 /*
4617  * this takes an I,J coordinate pair,
4618  * and works out the X and Y derivatives.
4619  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4620  */
4621 LLVMValueRef
4622 ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4623 {
4624         LLVMValueRef result[4], a;
4625         unsigned i;
4626
4627         for (i = 0; i < 2; i++) {
4628                 a = LLVMBuildExtractElement(ctx->builder, interp_ij,
4629                                             LLVMConstInt(ctx->i32, i, false), "");
4630                 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4631                 result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4632         }
4633         return ac_build_gather_values(ctx, result, 4);
4634 }
4635
4636 LLVMValueRef
4637 ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4638 {
4639         LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live",
4640                                                  ctx->i1, NULL, 0,
4641                                                  AC_FUNC_ATTR_READNONE);
4642         result = LLVMBuildNot(ctx->builder, result, "");
4643         return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
4644 }
4645
4646 LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func,
4647                            LLVMValueRef *args, unsigned num_args)
4648 {
4649         LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
4650         LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
4651         return ret;
4652 }