src/amd/common/ac_llvm_build.c

   1 /*
   2  * Copyright 2014 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sub license, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  19  *
  20  * The above copyright notice and this permission notice (including the
  21  * next paragraph) shall be included in all copies or substantial portions
  22  * of the Software.
  23  *
  24  */
  25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
  26 #include "ac_llvm_build.h"
  27
  28 #include <llvm-c/Core.h>
  29
  30 #include "c11/threads.h"
  31
  32 #include <assert.h>
  33 #include <stdio.h>
  34
  35 #include "ac_llvm_util.h"
  36 #include "ac_exp_param.h"
  37 #include "util/bitscan.h"
  38 #include "util/macros.h"
  39 #include "util/u_atomic.h"
  40 #include "util/u_math.h"
  41 #include "sid.h"
  42
  43 #include "shader_enums.h"
  44
/* NOTE(review): presumably the initial capacity of the control-flow stack;
 * its use is not visible in this part of the file.
 */
#define AC_LLVM_INITIAL_CF_DEPTH 4

/* Data for if/else/endif and bgnloop/endloop control flow structures.
 * One instance describes a single open control-flow construct.
 */
struct ac_llvm_flow {
        /* Loop exit or next part of if/else/endif. */
        LLVMBasicBlockRef next_block;
        /* NOTE(review): presumably the block a loop branches back to; only
         * meaningful for loops - confirm against the users of this struct.
         */
        LLVMBasicBlockRef loop_entry_block;
};
  54
  55 /* Initialize module-independent parts of the context.
  56  *
  57  * The caller is responsible for initializing ctx::module and ctx::builder.
  58  */
  59 void
  60 ac_llvm_context_init(struct ac_llvm_context *ctx,
  61                      struct ac_llvm_compiler *compiler,
  62                      enum chip_class chip_class, enum radeon_family family,
  63                      enum ac_float_mode float_mode, unsigned wave_size)
  64 {
  65         LLVMValueRef args[1];
  66
  67         ctx->context = LLVMContextCreate();
  68
  69         ctx->chip_class = chip_class;
  70         ctx->family = family;
  71         ctx->wave_size = wave_size;
  72         ctx->module = ac_create_module(wave_size == 32 ? compiler->tm_wave32
  73                                                        : compiler->tm,
  74                                        ctx->context);
  75         ctx->builder = ac_create_builder(ctx->context, float_mode);
  76
  77         ctx->voidt = LLVMVoidTypeInContext(ctx->context);
  78         ctx->i1 = LLVMInt1TypeInContext(ctx->context);
  79         ctx->i8 = LLVMInt8TypeInContext(ctx->context);
  80         ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
  81         ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
  82         ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
  83         ctx->intptr = ctx->i32;
  84         ctx->f16 = LLVMHalfTypeInContext(ctx->context);
  85         ctx->f32 = LLVMFloatTypeInContext(ctx->context);
  86         ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
  87         ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
  88         ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
  89         ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
  90         ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
  91         ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
  92         ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
  93         ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
  94         ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
  95         ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
  96
  97         ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
  98         ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
  99         ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
 100         ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
 101         ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
 102         ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
 103         ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
 104         ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
 105         ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
 106         ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
 107         ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
 108         ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
 109         ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
 110         ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
 111
 112         ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
 113         ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
 114
 115         ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 116                                                      "range", 5);
 117
 118         ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 119                                                                "invariant.load", 14);
 120
 121         ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
 122
 123         args[0] = LLVMConstReal(ctx->f32, 2.5);
 124         ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
 125
 126         ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 127                                                         "amdgpu.uniform", 14);
 128
 129         ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
 130         ctx->flow = calloc(1, sizeof(*ctx->flow));
 131 }
 132
 133 void
 134 ac_llvm_context_dispose(struct ac_llvm_context *ctx)
 135 {
 136         free(ctx->flow->stack);
 137         free(ctx->flow);
 138         ctx->flow = NULL;
 139 }
 140
 141 int
 142 ac_get_llvm_num_components(LLVMValueRef value)
 143 {
 144         LLVMTypeRef type = LLVMTypeOf(value);
 145         unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
 146                                       ? LLVMGetVectorSize(type)
 147                                       : 1;
 148         return num_components;
 149 }
 150
 151 LLVMValueRef
 152 ac_llvm_extract_elem(struct ac_llvm_context *ac,
 153                      LLVMValueRef value,
 154                      int index)
 155 {
 156         if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
 157                 assert(index == 0);
 158                 return value;
 159         }
 160
 161         return LLVMBuildExtractElement(ac->builder, value,
 162                                        LLVMConstInt(ac->i32, index, false), "");
 163 }
 164
 165 int
 166 ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
 167 {
 168         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
 169                 type = LLVMGetElementType(type);
 170
 171         if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
 172                 return LLVMGetIntTypeWidth(type);
 173
 174         if (type == ctx->f16)
 175                 return 16;
 176         if (type == ctx->f32)
 177                 return 32;
 178         if (type == ctx->f64)
 179                 return 64;
 180
 181         unreachable("Unhandled type kind in get_elem_bits");
 182 }
 183
 184 unsigned
 185 ac_get_type_size(LLVMTypeRef type)
 186 {
 187         LLVMTypeKind kind = LLVMGetTypeKind(type);
 188
 189         switch (kind) {
 190         case LLVMIntegerTypeKind:
 191                 return LLVMGetIntTypeWidth(type) / 8;
 192         case LLVMHalfTypeKind:
 193                 return 2;
 194         case LLVMFloatTypeKind:
 195                 return 4;
 196         case LLVMDoubleTypeKind:
 197                 return 8;
 198         case LLVMPointerTypeKind:
 199                 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
 200                         return 4;
 201                 return 8;
 202         case LLVMVectorTypeKind:
 203                 return LLVMGetVectorSize(type) *
 204                        ac_get_type_size(LLVMGetElementType(type));
 205         case LLVMArrayTypeKind:
 206                 return LLVMGetArrayLength(type) *
 207                        ac_get_type_size(LLVMGetElementType(type));
 208         default:
 209                 assert(0);
 210                 return 0;
 211         }
 212 }
 213
 214 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 215 {
 216         if (t == ctx->i8)
 217                 return ctx->i8;
 218         else if (t == ctx->f16 || t == ctx->i16)
 219                 return ctx->i16;
 220         else if (t == ctx->f32 || t == ctx->i32)
 221                 return ctx->i32;
 222         else if (t == ctx->f64 || t == ctx->i64)
 223                 return ctx->i64;
 224         else
 225                 unreachable("Unhandled integer size");
 226 }
 227
 228 LLVMTypeRef
 229 ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 230 {
 231         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 232                 LLVMTypeRef elem_type = LLVMGetElementType(t);
 233                 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
 234                                       LLVMGetVectorSize(t));
 235         }
 236         if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
 237                 switch (LLVMGetPointerAddressSpace(t)) {
 238                 case AC_ADDR_SPACE_GLOBAL:
 239                         return ctx->i64;
 240                 case AC_ADDR_SPACE_LDS:
 241                         return ctx->i32;
 242                 default:
 243                         unreachable("unhandled address space");
 244                 }
 245         }
 246         return to_integer_type_scalar(ctx, t);
 247 }
 248
 249 LLVMValueRef
 250 ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
 251 {
 252         LLVMTypeRef type = LLVMTypeOf(v);
 253         if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
 254                 return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 255         }
 256         return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 257 }
 258
 259 LLVMValueRef
 260 ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
 261 {
 262         LLVMTypeRef type = LLVMTypeOf(v);
 263         if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
 264                 return v;
 265         return ac_to_integer(ctx, v);
 266 }
 267
 268 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 269 {
 270         if (t == ctx->i8)
 271                 return ctx->i8;
 272         else if (t == ctx->i16 || t == ctx->f16)
 273                 return ctx->f16;
 274         else if (t == ctx->i32 || t == ctx->f32)
 275                 return ctx->f32;
 276         else if (t == ctx->i64 || t == ctx->f64)
 277                 return ctx->f64;
 278         else
 279                 unreachable("Unhandled float size");
 280 }
 281
 282 LLVMTypeRef
 283 ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 284 {
 285         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 286                 LLVMTypeRef elem_type = LLVMGetElementType(t);
 287                 return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
 288                                       LLVMGetVectorSize(t));
 289         }
 290         return to_float_type_scalar(ctx, t);
 291 }
 292
 293 LLVMValueRef
 294 ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
 295 {
 296         LLVMTypeRef type = LLVMTypeOf(v);
 297         return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
 298 }
 299
 300
 301 LLVMValueRef
 302 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
 303                    LLVMTypeRef return_type, LLVMValueRef *params,
 304                    unsigned param_count, unsigned attrib_mask)
 305 {
 306         LLVMValueRef function, call;
 307         bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
 308
 309         function = LLVMGetNamedFunction(ctx->module, name);
 310         if (!function) {
 311                 LLVMTypeRef param_types[32], function_type;
 312                 unsigned i;
 313
 314                 assert(param_count <= 32);
 315
 316                 for (i = 0; i < param_count; ++i) {
 317                         assert(params[i]);
 318                         param_types[i] = LLVMTypeOf(params[i]);
 319                 }
 320                 function_type =
 321                     LLVMFunctionType(return_type, param_types, param_count, 0);
 322                 function = LLVMAddFunction(ctx->module, name, function_type);
 323
 324                 LLVMSetFunctionCallConv(function, LLVMCCallConv);
 325                 LLVMSetLinkage(function, LLVMExternalLinkage);
 326
 327                 if (!set_callsite_attrs)
 328                         ac_add_func_attributes(ctx->context, function, attrib_mask);
 329         }
 330
 331         call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
 332         if (set_callsite_attrs)
 333                 ac_add_func_attributes(ctx->context, call, attrib_mask);
 334         return call;
 335 }
 336
 337 /**
 338  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 339  * intrinsic names).
 340  */
 341 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
 342 {
 343         LLVMTypeRef elem_type = type;
 344
 345         assert(bufsize >= 8);
 346
 347         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
 348                 int ret = snprintf(buf, bufsize, "v%u",
 349                                         LLVMGetVectorSize(type));
 350                 if (ret < 0) {
 351                         char *type_name = LLVMPrintTypeToString(type);
 352                         fprintf(stderr, "Error building type name for: %s\n",
 353                                 type_name);
 354                         LLVMDisposeMessage(type_name);
 355                         return;
 356                 }
 357                 elem_type = LLVMGetElementType(type);
 358                 buf += ret;
 359                 bufsize -= ret;
 360         }
 361         switch (LLVMGetTypeKind(elem_type)) {
 362         default: break;
 363         case LLVMIntegerTypeKind:
 364                 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
 365                 break;
 366         case LLVMHalfTypeKind:
 367                 snprintf(buf, bufsize, "f16");
 368                 break;
 369         case LLVMFloatTypeKind:
 370                 snprintf(buf, bufsize, "f32");
 371                 break;
 372         case LLVMDoubleTypeKind:
 373                 snprintf(buf, bufsize, "f64");
 374                 break;
 375         }
 376 }
 377
 378 /**
 379  * Helper function that builds an LLVM IR PHI node and immediately adds
 380  * incoming edges.
 381  */
 382 LLVMValueRef
 383 ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
 384              unsigned count_incoming, LLVMValueRef *values,
 385              LLVMBasicBlockRef *blocks)
 386 {
 387         LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
 388         LLVMAddIncoming(phi, values, blocks, count_incoming);
 389         return phi;
 390 }
 391
 392 void ac_build_s_barrier(struct ac_llvm_context *ctx)
 393 {
 394         ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
 395                            0, AC_FUNC_ATTR_CONVERGENT);
 396 }
 397
 398 /* Prevent optimizations (at least of memory accesses) across the current
 399  * point in the program by emitting empty inline assembly that is marked as
 400  * having side effects.
 401  *
 402  * Optionally, a value can be passed through the inline assembly to prevent
 403  * LLVM from hoisting calls to ReadNone functions.
 404  */
 405 void
 406 ac_build_optimization_barrier(struct ac_llvm_context *ctx,
 407                               LLVMValueRef *pvgpr)
 408 {
 409         static int counter = 0;
 410
 411         LLVMBuilderRef builder = ctx->builder;
 412         char code[16];
 413
 414         snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
 415
 416         if (!pvgpr) {
 417                 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
 418                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
 419                 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
 420         } else {
 421                 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
 422                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
 423                 LLVMValueRef vgpr = *pvgpr;
 424                 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
 425                 unsigned vgpr_size = ac_get_type_size(vgpr_type);
 426                 LLVMValueRef vgpr0;
 427
 428                 assert(vgpr_size % 4 == 0);
 429
 430                 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
 431                 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
 432                 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
 433                 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
 434                 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
 435
 436                 *pvgpr = vgpr;
 437         }
 438 }
 439
 440 LLVMValueRef
 441 ac_build_shader_clock(struct ac_llvm_context *ctx)
 442 {
 443         const char *intr = HAVE_LLVM >= 0x0900 && ctx->chip_class >= GFX8 ?
 444                                 "llvm.amdgcn.s.memrealtime" : "llvm.readcyclecounter";
 445         LLVMValueRef tmp = ac_build_intrinsic(ctx, intr, ctx->i64, NULL, 0, 0);
 446         return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
 447 }
 448
 449 LLVMValueRef
 450 ac_build_ballot(struct ac_llvm_context *ctx,
 451                 LLVMValueRef value)
 452 {
 453         const char *name;
 454
 455         if (HAVE_LLVM >= 0x900) {
 456                 if (ctx->wave_size == 64)
 457                         name = "llvm.amdgcn.icmp.i64.i32";
 458                 else
 459                         name = "llvm.amdgcn.icmp.i32.i32";
 460         } else {
 461                 name = "llvm.amdgcn.icmp.i32";
 462         }
 463         LLVMValueRef args[3] = {
 464                 value,
 465                 ctx->i32_0,
 466                 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
 467         };
 468
 469         /* We currently have no other way to prevent LLVM from lifting the icmp
 470          * calls to a dominating basic block.
 471          */
 472         ac_build_optimization_barrier(ctx, &args[0]);
 473
 474         args[0] = ac_to_integer(ctx, args[0]);
 475
 476         return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3,
 477                                   AC_FUNC_ATTR_NOUNWIND |
 478                                   AC_FUNC_ATTR_READNONE |
 479                                   AC_FUNC_ATTR_CONVERGENT);
 480 }
 481
 482 LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
 483                                  LLVMValueRef value)
 484 {
 485         const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i1";
 486         LLVMValueRef args[3] = {
 487                 value,
 488                 ctx->i1false,
 489                 LLVMConstInt(ctx->i32, LLVMIntNE, 0),
 490         };
 491
 492         assert(HAVE_LLVM >= 0x0800);
 493         return ac_build_intrinsic(ctx, name, ctx->i64, args, 3,
 494                                   AC_FUNC_ATTR_NOUNWIND |
 495                                   AC_FUNC_ATTR_READNONE |
 496                                   AC_FUNC_ATTR_CONVERGENT);
 497 }
 498
 499 LLVMValueRef
 500 ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
 501 {
 502         LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
 503         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 504         return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
 505 }
 506
 507 LLVMValueRef
 508 ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
 509 {
 510         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 511         return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
 512                              LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
 513 }
 514
 515 LLVMValueRef
 516 ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
 517 {
 518         LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
 519         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 520
 521         LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 522                                          vote_set, active_set, "");
 523         LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 524                                           vote_set,
 525                                           LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
 526         return LLVMBuildOr(ctx->builder, all, none, "");
 527 }
 528
 529 LLVMValueRef
 530 ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
 531                                unsigned value_count, unsigned component)
 532 {
 533         LLVMValueRef vec = NULL;
 534
 535         if (value_count == 1) {
 536                 return values[component];
 537         } else if (!value_count)
 538                 unreachable("value_count is 0");
 539
 540         for (unsigned i = component; i < value_count + component; i++) {
 541                 LLVMValueRef value = values[i];
 542
 543                 if (i == component)
 544                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 545                 LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
 546                 vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
 547         }
 548         return vec;
 549 }
 550
 551 LLVMValueRef
 552 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
 553                                 LLVMValueRef *values,
 554                                 unsigned value_count,
 555                                 unsigned value_stride,
 556                                 bool load,
 557                                 bool always_vector)
 558 {
 559         LLVMBuilderRef builder = ctx->builder;
 560         LLVMValueRef vec = NULL;
 561         unsigned i;
 562
 563         if (value_count == 1 && !always_vector) {
 564                 if (load)
 565                         return LLVMBuildLoad(builder, values[0], "");
 566                 return values[0];
 567         } else if (!value_count)
 568                 unreachable("value_count is 0");
 569
 570         for (i = 0; i < value_count; i++) {
 571                 LLVMValueRef value = values[i * value_stride];
 572                 if (load)
 573                         value = LLVMBuildLoad(builder, value, "");
 574
 575                 if (!i)
 576                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 577                 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
 578                 vec = LLVMBuildInsertElement(builder, vec, value, index, "");
 579         }
 580         return vec;
 581 }
 582
 583 LLVMValueRef
 584 ac_build_gather_values(struct ac_llvm_context *ctx,
 585                        LLVMValueRef *values,
 586                        unsigned value_count)
 587 {
 588         return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
 589 }
 590
 591 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 592  * channels with undef. Extract at most src_channels components from the input.
 593  */
 594 static LLVMValueRef
 595 ac_build_expand(struct ac_llvm_context *ctx,
 596                 LLVMValueRef value,
 597                 unsigned src_channels,
 598                 unsigned dst_channels)
 599 {
 600         LLVMTypeRef elemtype;
 601         LLVMValueRef chan[dst_channels];
 602
 603         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
 604                 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
 605
 606                 if (src_channels == dst_channels && vec_size == dst_channels)
 607                         return value;
 608
 609                 src_channels = MIN2(src_channels, vec_size);
 610
 611                 for (unsigned i = 0; i < src_channels; i++)
 612                         chan[i] = ac_llvm_extract_elem(ctx, value, i);
 613
 614                 elemtype = LLVMGetElementType(LLVMTypeOf(value));
 615         } else {
 616                 if (src_channels) {
 617                         assert(src_channels == 1);
 618                         chan[0] = value;
 619                 }
 620                 elemtype = LLVMTypeOf(value);
 621         }
 622
 623         for (unsigned i = src_channels; i < dst_channels; i++)
 624                 chan[i] = LLVMGetUndef(elemtype);
 625
 626         return ac_build_gather_values(ctx, chan, dst_channels);
 627 }
 628
 629 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
 630  * with undef. Extract at most num_channels components from the input.
 631  */
 632 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
 633                                      LLVMValueRef value,
 634                                      unsigned num_channels)
 635 {
 636         return ac_build_expand(ctx, value, num_channels, 4);
 637 }
 638
 639 LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
 640 {
 641         unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
 642         const char *name;
 643
 644         if (type_size == 2)
 645                 name = "llvm.rint.f16";
 646         else if (type_size == 4)
 647                 name = "llvm.rint.f32";
 648         else
 649                 name = "llvm.rint.f64";
 650
 651         return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
 652                                   AC_FUNC_ATTR_READNONE);
 653 }
 654
 655 LLVMValueRef
 656 ac_build_fdiv(struct ac_llvm_context *ctx,
 657               LLVMValueRef num,
 658               LLVMValueRef den)
 659 {
 660         /* If we do (num / den), LLVM >= 7.0 does:
 661          *    return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
 662          *
 663          * If we do (num * (1 / den)), LLVM does:
 664          *    return num * v_rcp_f32(den);
 665          */
 666         LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
 667         LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
 668         LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
 669
 670         /* Use v_rcp_f32 instead of precise division. */
 671         if (!LLVMIsConstant(ret))
 672                 LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
 673         return ret;
 674 }
 675
 676 /* See fast_idiv_by_const.h. */
 677 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
 678 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
 679                                 LLVMValueRef num,
 680                                 LLVMValueRef multiplier,
 681                                 LLVMValueRef pre_shift,
 682                                 LLVMValueRef post_shift,
 683                                 LLVMValueRef increment)
 684 {
 685         LLVMBuilderRef builder = ctx->builder;
 686
 687         num = LLVMBuildLShr(builder, num, pre_shift, "");
 688         num = LLVMBuildMul(builder,
 689                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 690                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 691         num = LLVMBuildAdd(builder, num,
 692                            LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
 693         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 694         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 695         return LLVMBuildLShr(builder, num, post_shift, "");
 696 }
 697
 698 /* See fast_idiv_by_const.h. */
 699 /* If num != UINT_MAX, this more efficient version can be used. */
 700 /* Set: increment = util_fast_udiv_info::increment; */
 701 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
 702                                     LLVMValueRef num,
 703                                     LLVMValueRef multiplier,
 704                                     LLVMValueRef pre_shift,
 705                                     LLVMValueRef post_shift,
 706                                     LLVMValueRef increment)
 707 {
 708         LLVMBuilderRef builder = ctx->builder;
 709
 710         num = LLVMBuildLShr(builder, num, pre_shift, "");
 711         num = LLVMBuildNUWAdd(builder, num, increment, "");
 712         num = LLVMBuildMul(builder,
 713                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 714                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 715         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 716         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 717         return LLVMBuildLShr(builder, num, post_shift, "");
 718 }
 719
 720 /* See fast_idiv_by_const.h. */
 721 /* Both operands must fit in 31 bits and the divisor must not be 1. */
 722 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
 723                                               LLVMValueRef num,
 724                                               LLVMValueRef multiplier,
 725                                               LLVMValueRef post_shift)
 726 {
 727         LLVMBuilderRef builder = ctx->builder;
 728
 729         num = LLVMBuildMul(builder,
 730                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 731                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 732         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 733         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 734         return LLVMBuildLShr(builder, num, post_shift, "");
 735 }
 736
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
        /* stc[0] holds sc and stc[1] holds tc (see build_cube_intrinsic). */
        LLVMValueRef stc[2];
        /* Major axis value, already multiplied by two. */
        LLVMValueRef ma;
        /* Cube face number. */
        LLVMValueRef id;
};
 746
 747 static void
 748 build_cube_intrinsic(struct ac_llvm_context *ctx,
 749                      LLVMValueRef in[3],
 750                      struct cube_selection_coords *out)
 751 {
 752         LLVMTypeRef f32 = ctx->f32;
 753
 754         out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
 755                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 756         out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
 757                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 758         out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
 759                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 760         out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
 761                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 762 }
 763
 764 /**
 765  * Build a manual selection sequence for cube face sc/tc coordinates and
 766  * major axis vector (multiplied by 2 for consistency) for the given
 767  * vec3 \p coords, for the face implied by \p selcoords.
 768  *
 769  * For the major axis, we always adjust the sign to be in the direction of
 770  * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 771  * the selcoords major axis.
 772  */
 773 static void build_cube_select(struct ac_llvm_context *ctx,
 774                               const struct cube_selection_coords *selcoords,
 775                               const LLVMValueRef *coords,
 776                               LLVMValueRef *out_st,
 777                               LLVMValueRef *out_ma)
 778 {
 779         LLVMBuilderRef builder = ctx->builder;
 780         LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
 781         LLVMValueRef is_ma_positive;
 782         LLVMValueRef sgn_ma;
 783         LLVMValueRef is_ma_z, is_not_ma_z;
 784         LLVMValueRef is_ma_y;
 785         LLVMValueRef is_ma_x;
 786         LLVMValueRef sgn;
 787         LLVMValueRef tmp;
 788
 789         is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
 790                 selcoords->ma, LLVMConstReal(f32, 0.0), "");
 791         sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
 792                 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
 793
 794         is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
 795         is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
 796         is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
 797                 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
 798         is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
 799
 800         /* Select sc */
 801         tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
 802         sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
 803                 LLVMBuildSelect(builder, is_ma_z, sgn_ma,
 804                         LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
 805         out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
 806
 807         /* Select tc */
 808         tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
 809         sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
 810                 LLVMConstReal(f32, -1.0), "");
 811         out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
 812
 813         /* Select ma */
 814         tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
 815                 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
 816         tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 817                                  ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
 818         *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
 819 }
 820
 821 void
 822 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
 823                        bool is_deriv, bool is_array, bool is_lod,
 824                        LLVMValueRef *coords_arg,
 825                        LLVMValueRef *derivs_arg)
 826 {
 827
 828         LLVMBuilderRef builder = ctx->builder;
 829         struct cube_selection_coords selcoords;
 830         LLVMValueRef coords[3];
 831         LLVMValueRef invma;
 832
 833         if (is_array && !is_lod) {
 834                 LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
 835
 836                 /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
 837                  *
 838                  *    "For Array forms, the array layer used will be
 839                  *
 840                  *       max(0, min(d−1, floor(layer+0.5)))
 841                  *
 842                  *     where d is the depth of the texture array and layer
 843                  *     comes from the component indicated in the tables below.
 844                  *     Workaroudn for an issue where the layer is taken from a
 845                  *     helper invocation which happens to fall on a different
 846                  *     layer due to extrapolation."
 847                  *
 848                  * GFX8 and earlier attempt to implement this in hardware by
 849                  * clamping the value of coords[2] = (8 * layer) + face.
 850                  * Unfortunately, this means that the we end up with the wrong
 851                  * face when clamping occurs.
 852                  *
 853                  * Clamp the layer earlier to work around the issue.
 854                  */
 855                 if (ctx->chip_class <= GFX8) {
 856                         LLVMValueRef ge0;
 857                         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
 858                         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
 859                 }
 860
 861                 coords_arg[3] = tmp;
 862         }
 863
 864         build_cube_intrinsic(ctx, coords_arg, &selcoords);
 865
 866         invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 867                         ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
 868         invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
 869
 870         for (int i = 0; i < 2; ++i)
 871                 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
 872
 873         coords[2] = selcoords.id;
 874
 875         if (is_deriv && derivs_arg) {
 876                 LLVMValueRef derivs[4];
 877                 int axis;
 878
 879                 /* Convert cube derivatives to 2D derivatives. */
 880                 for (axis = 0; axis < 2; axis++) {
 881                         LLVMValueRef deriv_st[2];
 882                         LLVMValueRef deriv_ma;
 883
 884                         /* Transform the derivative alongside the texture
 885                          * coordinate. Mathematically, the correct formula is
 886                          * as follows. Assume we're projecting onto the +Z face
 887                          * and denote by dx/dh the derivative of the (original)
 888                          * X texture coordinate with respect to horizontal
 889                          * window coordinates. The projection onto the +Z face
 890                          * plane is:
 891                          *
 892                          *   f(x,z) = x/z
 893                          *
 894                          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
 895                          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
 896                          *
 897                          * This motivatives the implementation below.
 898                          *
 899                          * Whether this actually gives the expected results for
 900                          * apps that might feed in derivatives obtained via
 901                          * finite differences is anyone's guess. The OpenGL spec
 902                          * seems awfully quiet about how textureGrad for cube
 903                          * maps should be handled.
 904                          */
 905                         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
 906                                           deriv_st, &deriv_ma);
 907
 908                         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
 909
 910                         for (int i = 0; i < 2; ++i)
 911                                 derivs[axis * 2 + i] =
 912                                         LLVMBuildFSub(builder,
 913                                                 LLVMBuildFMul(builder, deriv_st[i], invma, ""),
 914                                                 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
 915                 }
 916
 917                 memcpy(derivs_arg, derivs, sizeof(derivs));
 918         }
 919
 920         /* Shift the texture coordinate. This must be applied after the
 921          * derivative calculation.
 922          */
 923         for (int i = 0; i < 2; ++i)
 924                 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
 925
 926         if (is_array) {
 927                 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
 928                 /* coords_arg.w component - array_index for cube arrays */
 929                 coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
 930         }
 931
 932         memcpy(coords_arg, coords, sizeof(coords));
 933 }
 934
 935
 936 LLVMValueRef
 937 ac_build_fs_interp(struct ac_llvm_context *ctx,
 938                    LLVMValueRef llvm_chan,
 939                    LLVMValueRef attr_number,
 940                    LLVMValueRef params,
 941                    LLVMValueRef i,
 942                    LLVMValueRef j)
 943 {
 944         LLVMValueRef args[5];
 945         LLVMValueRef p1;
 946
 947         args[0] = i;
 948         args[1] = llvm_chan;
 949         args[2] = attr_number;
 950         args[3] = params;
 951
 952         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
 953                                 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 954
 955         args[0] = p1;
 956         args[1] = j;
 957         args[2] = llvm_chan;
 958         args[3] = attr_number;
 959         args[4] = params;
 960
 961         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
 962                                   ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 963 }
 964
 965 LLVMValueRef
 966 ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
 967                        LLVMValueRef llvm_chan,
 968                        LLVMValueRef attr_number,
 969                        LLVMValueRef params,
 970                        LLVMValueRef i,
 971                        LLVMValueRef j)
 972 {
 973         LLVMValueRef args[6];
 974         LLVMValueRef p1;
 975
 976         args[0] = i;
 977         args[1] = llvm_chan;
 978         args[2] = attr_number;
 979         args[3] = ctx->i1false;
 980         args[4] = params;
 981
 982         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
 983                                 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 984
 985         args[0] = p1;
 986         args[1] = j;
 987         args[2] = llvm_chan;
 988         args[3] = attr_number;
 989         args[4] = ctx->i1false;
 990         args[5] = params;
 991
 992         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
 993                                   ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
 994 }
 995
 996 LLVMValueRef
 997 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
 998                        LLVMValueRef parameter,
 999                        LLVMValueRef llvm_chan,
1000                        LLVMValueRef attr_number,
1001                        LLVMValueRef params)
1002 {
1003         LLVMValueRef args[4];
1004
1005         args[0] = parameter;
1006         args[1] = llvm_chan;
1007         args[2] = attr_number;
1008         args[3] = params;
1009
1010         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
1011                                   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
1012 }
1013
1014 LLVMValueRef
1015 ac_build_gep_ptr(struct ac_llvm_context *ctx,
1016                  LLVMValueRef base_ptr,
1017                  LLVMValueRef index)
1018 {
1019         return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1020 }
1021
1022 LLVMValueRef
1023 ac_build_gep0(struct ac_llvm_context *ctx,
1024               LLVMValueRef base_ptr,
1025               LLVMValueRef index)
1026 {
1027         LLVMValueRef indices[2] = {
1028                 ctx->i32_0,
1029                 index,
1030         };
1031         return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
1032 }
1033
1034 LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
1035                                   LLVMValueRef index)
1036 {
1037         return LLVMBuildPointerCast(ctx->builder,
1038                                     LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
1039                                     LLVMTypeOf(ptr), "");
1040 }
1041
1042 void
1043 ac_build_indexed_store(struct ac_llvm_context *ctx,
1044                        LLVMValueRef base_ptr, LLVMValueRef index,
1045                        LLVMValueRef value)
1046 {
1047         LLVMBuildStore(ctx->builder, value,
1048                        ac_build_gep0(ctx, base_ptr, index));
1049 }
1050
1051 /**
1052  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1053  * It's equivalent to doing a load from &base_ptr[index].
1054  *
1055  * \param base_ptr  Where the array starts.
1056  * \param index     The element index into the array.
1057  * \param uniform   Whether the base_ptr and index can be assumed to be
1058  *                  dynamically uniform (i.e. load to an SGPR)
1059  * \param invariant Whether the load is invariant (no other opcodes affect it)
1060  * \param no_unsigned_wraparound
1061  *    For all possible re-associations and re-distributions of an expression
1062  *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1063  *    without inbounds in base_ptr), this parameter is true if "addr + offset"
1064  *    does not result in an unsigned integer wraparound. This is used for
1065  *    optimal code generation of 32-bit pointer arithmetic.
1066  *
1067  *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
1068  *    integer wraparound can't be an imm offset in s_load_dword, because
1069  *    the instruction performs "addr + offset" in 64 bits.
1070  *
1071  *    Expected usage for bindless textures by chaining GEPs:
1072  *      // possible unsigned wraparound, don't use InBounds:
1073  *      ptr1 = LLVMBuildGEP(base_ptr, index);
1074  *      image = load(ptr1); // becomes "s_load ptr1, 0"
1075  *
1076  *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1077  *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1078  */
1079 static LLVMValueRef
1080 ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1081                      LLVMValueRef index, bool uniform, bool invariant,
1082                      bool no_unsigned_wraparound)
1083 {
1084         LLVMValueRef pointer, result;
1085
1086         if (no_unsigned_wraparound &&
1087             LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1088                 pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
1089         else
1090                 pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1091
1092         if (uniform)
1093                 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1094         result = LLVMBuildLoad(ctx->builder, pointer, "");
1095         if (invariant)
1096                 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1097         return result;
1098 }
1099
1100 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1101                            LLVMValueRef index)
1102 {
1103         return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
1104 }
1105
1106 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
1107                                      LLVMValueRef base_ptr, LLVMValueRef index)
1108 {
1109         return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
1110 }
1111
1112 /* This assumes that there is no unsigned integer wraparound during the address
1113  * computation, excluding all GEPs within base_ptr. */
1114 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
1115                                    LLVMValueRef base_ptr, LLVMValueRef index)
1116 {
1117         return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1118 }
1119
1120 /* See ac_build_load_custom() documentation. */
1121 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1122                                    LLVMValueRef base_ptr, LLVMValueRef index)
1123 {
1124         return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1125 }
1126
1127 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx,
1128                                       unsigned cache_policy)
1129 {
1130         return cache_policy |
1131                (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
1132 }
1133
1134 static void
1135 ac_build_llvm7_buffer_store_common(struct ac_llvm_context *ctx,
1136                                    LLVMValueRef rsrc,
1137                                    LLVMValueRef data,
1138                                    LLVMValueRef vindex,
1139                                    LLVMValueRef voffset,
1140                                    unsigned num_channels,
1141                                    unsigned cache_policy,
1142                                    bool use_format)
1143 {
1144         LLVMValueRef args[] = {
1145                 data,
1146                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1147                 vindex ? vindex : ctx->i32_0,
1148                 voffset,
1149                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
1150                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0)
1151         };
1152         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1153
1154         const char *type_names[] = {"f32", "v2f32", "v4f32"};
1155         char name[256];
1156
1157         if (use_format) {
1158                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s",
1159                          type_names[func]);
1160         } else {
1161                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
1162                          type_names[func]);
1163         }
1164
1165         ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
1166                            AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1167 }
1168
1169 static void
1170 ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
1171                                    LLVMValueRef rsrc,
1172                                    LLVMValueRef data,
1173                                    LLVMValueRef vindex,
1174                                    LLVMValueRef voffset,
1175                                    LLVMValueRef soffset,
1176                                    unsigned num_channels,
1177                                    LLVMTypeRef return_channel_type,
1178                                    unsigned cache_policy,
1179                                    bool use_format,
1180                                    bool structurized)
1181 {
1182         LLVMValueRef args[6];
1183         int idx = 0;
1184         args[idx++] = data;
1185         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1186         if (structurized)
1187                 args[idx++] = vindex ? vindex : ctx->i32_0;
1188         args[idx++] = voffset ? voffset : ctx->i32_0;
1189         args[idx++] = soffset ? soffset : ctx->i32_0;
1190         args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
1191         unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1192         const char *indexing_kind = structurized ? "struct" : "raw";
1193         char name[256], type_name[8];
1194
1195         LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type;
1196         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1197
1198         if (use_format) {
1199                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
1200                          indexing_kind, type_name);
1201         } else {
1202                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s",
1203                          indexing_kind, type_name);
1204         }
1205
1206         ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
1207                            AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1208 }
1209
1210 void
1211 ac_build_buffer_store_format(struct ac_llvm_context *ctx,
1212                              LLVMValueRef rsrc,
1213                              LLVMValueRef data,
1214                              LLVMValueRef vindex,
1215                              LLVMValueRef voffset,
1216                              unsigned num_channels,
1217                              unsigned cache_policy)
1218 {
1219         if (HAVE_LLVM >= 0x800) {
1220                 ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
1221                                                    voffset, NULL, num_channels,
1222                                                    ctx->f32, cache_policy,
1223                                                    true, true);
1224         } else {
1225                 ac_build_llvm7_buffer_store_common(ctx, rsrc, data, vindex, voffset,
1226                                                    num_channels, cache_policy,
1227                                                    true);
1228         }
1229 }
1230
1231 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1232  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1233  * or v4i32 (num_channels=3,4).
1234  */
1235 void
1236 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
1237                             LLVMValueRef rsrc,
1238                             LLVMValueRef vdata,
1239                             unsigned num_channels,
1240                             LLVMValueRef voffset,
1241                             LLVMValueRef soffset,
1242                             unsigned inst_offset,
1243                             unsigned cache_policy,
1244                             bool swizzle_enable_hint)
1245 {
1246         /* Split 3 channel stores, because only LLVM 9+ support 3-channel
1247          * intrinsics. */
1248         if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
1249                 LLVMValueRef v[3], v01;
1250
1251                 for (int i = 0; i < 3; i++) {
1252                         v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
1253                                         LLVMConstInt(ctx->i32, i, 0), "");
1254                 }
1255                 v01 = ac_build_gather_values(ctx, v, 2);
1256
1257                 ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
1258                                             soffset, inst_offset, cache_policy,
1259                                             swizzle_enable_hint);
1260                 ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
1261                                             soffset, inst_offset + 8,
1262                                             cache_policy,
1263                                             swizzle_enable_hint);
1264                 return;
1265         }
1266
1267         /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
1268          * (voffset is swizzled, but soffset isn't swizzled).
1269          * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
1270          */
1271         if (!swizzle_enable_hint) {
1272                 LLVMValueRef offset = soffset;
1273
1274                 if (inst_offset)
1275                         offset = LLVMBuildAdd(ctx->builder, offset,
1276                                               LLVMConstInt(ctx->i32, inst_offset, 0), "");
1277
1278                 if (HAVE_LLVM >= 0x800) {
1279                         ac_build_llvm8_buffer_store_common(ctx, rsrc,
1280                                                            ac_to_float(ctx, vdata),
1281                                                            ctx->i32_0,
1282                                                            voffset, offset,
1283                                                            num_channels,
1284                                                            ctx->f32,
1285                                                            cache_policy,
1286                                                            false, false);
1287                 } else {
1288                         if (voffset)
1289                                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1290
1291                         ac_build_llvm7_buffer_store_common(ctx, rsrc,
1292                                                            ac_to_float(ctx, vdata),
1293                                                            ctx->i32_0, offset,
1294                                                            num_channels, cache_policy,
1295                                                            false);
1296                 }
1297                 return;
1298         }
1299
1300         static const unsigned dfmts[] = {
1301                 V_008F0C_BUF_DATA_FORMAT_32,
1302                 V_008F0C_BUF_DATA_FORMAT_32_32,
1303                 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1304                 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1305         };
1306         unsigned dfmt = dfmts[num_channels - 1];
1307         unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1308         LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
1309
1310         ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1311                                    immoffset, num_channels, dfmt, nfmt, cache_policy);
1312 }
1313
1314 static LLVMValueRef
1315 ac_build_llvm7_buffer_load_common(struct ac_llvm_context *ctx,
1316                                   LLVMValueRef rsrc,
1317                                   LLVMValueRef vindex,
1318                                   LLVMValueRef voffset,
1319                                   unsigned num_channels,
1320                                   unsigned cache_policy,
1321                                   bool can_speculate,
1322                                   bool use_format)
1323 {
1324         LLVMValueRef args[] = {
1325                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1326                 vindex ? vindex : ctx->i32_0,
1327                 voffset,
1328                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
1329                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0)
1330         };
1331         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1332
1333         LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
1334         const char *type_names[] = {"f32", "v2f32", "v4f32"};
1335         char name[256];
1336
1337         if (use_format) {
1338                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s",
1339                          type_names[func]);
1340         } else {
1341                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
1342                          type_names[func]);
1343         }
1344
1345         return ac_build_intrinsic(ctx, name, types[func], args,
1346                                   ARRAY_SIZE(args),
1347                                   ac_get_load_intr_attribs(can_speculate));
1348 }
1349
1350 static LLVMValueRef
1351 ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
1352                                   LLVMValueRef rsrc,
1353                                   LLVMValueRef vindex,
1354                                   LLVMValueRef voffset,
1355                                   LLVMValueRef soffset,
1356                                   unsigned num_channels,
1357                                   LLVMTypeRef channel_type,
1358                                   unsigned cache_policy,
1359                                   bool can_speculate,
1360                                   bool use_format,
1361                                   bool structurized)
1362 {
1363         LLVMValueRef args[5];
1364         int idx = 0;
1365         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1366         if (structurized)
1367                 args[idx++] = vindex ? vindex : ctx->i32_0;
1368         args[idx++] = voffset ? voffset : ctx->i32_0;
1369         args[idx++] = soffset ? soffset : ctx->i32_0;
1370         args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1371         unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1372         const char *indexing_kind = structurized ? "struct" : "raw";
1373         char name[256], type_name[8];
1374
1375         LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
1376         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1377
1378         if (use_format) {
1379                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
1380                          indexing_kind, type_name);
1381         } else {
1382                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
1383                          indexing_kind, type_name);
1384         }
1385
1386         return ac_build_intrinsic(ctx, name, type, args, idx,
1387                                   ac_get_load_intr_attribs(can_speculate));
1388 }
1389
1390 LLVMValueRef
1391 ac_build_buffer_load(struct ac_llvm_context *ctx,
1392                      LLVMValueRef rsrc,
1393                      int num_channels,
1394                      LLVMValueRef vindex,
1395                      LLVMValueRef voffset,
1396                      LLVMValueRef soffset,
1397                      unsigned inst_offset,
1398                      unsigned cache_policy,
1399                      bool can_speculate,
1400                      bool allow_smem)
1401 {
1402         LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
1403         if (voffset)
1404                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1405         if (soffset)
1406                 offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
1407
1408         if (allow_smem && !(cache_policy & ac_slc) &&
1409             (!(cache_policy & ac_glc) || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= GFX8))) {
1410                 assert(vindex == NULL);
1411
1412                 LLVMValueRef result[8];
1413
1414                 for (int i = 0; i < num_channels; i++) {
1415                         if (i) {
1416                                 offset = LLVMBuildAdd(ctx->builder, offset,
1417                                                       LLVMConstInt(ctx->i32, 4, 0), "");
1418                         }
1419                         const char *intrname =
1420                                 HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
1421                                                     : "llvm.SI.load.const.v4i32";
1422                         unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
1423                         LLVMValueRef args[3] = {
1424                                 rsrc,
1425                                 offset,
1426                                 LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
1427                         };
1428                         result[i] = ac_build_intrinsic(ctx, intrname,
1429                                                        ctx->f32, args, num_args,
1430                                                        AC_FUNC_ATTR_READNONE |
1431                                                        (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0));
1432                 }
1433                 if (num_channels == 1)
1434                         return result[0];
1435
1436                 if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
1437                         result[num_channels++] = LLVMGetUndef(ctx->f32);
1438                 return ac_build_gather_values(ctx, result, num_channels);
1439         }
1440
1441         if (HAVE_LLVM >= 0x0800) {
1442                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex,
1443                                                          offset, ctx->i32_0,
1444                                                          num_channels, ctx->f32,
1445                                                          cache_policy,
1446                                                          can_speculate, false,
1447                                                          false);
1448         }
1449
1450         return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, offset,
1451                                                  num_channels, cache_policy,
1452                                                  can_speculate, false);
1453 }
1454
1455 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
1456                                          LLVMValueRef rsrc,
1457                                          LLVMValueRef vindex,
1458                                          LLVMValueRef voffset,
1459                                          unsigned num_channels,
1460                                          unsigned cache_policy,
1461                                          bool can_speculate)
1462 {
1463         if (HAVE_LLVM >= 0x800) {
1464                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1465                                                          num_channels, ctx->f32,
1466                                                          cache_policy, can_speculate, true, true);
1467         }
1468         return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, voffset,
1469                                                  num_channels, cache_policy,
1470                                                  can_speculate, true);
1471 }
1472
1473 LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
1474                                                   LLVMValueRef rsrc,
1475                                                   LLVMValueRef vindex,
1476                                                   LLVMValueRef voffset,
1477                                                   unsigned num_channels,
1478                                                   unsigned cache_policy,
1479                                                   bool can_speculate)
1480 {
1481         if (HAVE_LLVM >= 0x800) {
1482                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1483                                                          num_channels, ctx->f32,
1484                                                          cache_policy, can_speculate, true, true);
1485         }
1486
1487         LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
1488         LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
1489         stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
1490
1491         LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
1492                                                       LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
1493                                                       elem_count, stride, "");
1494
1495         LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
1496                                                        LLVMConstInt(ctx->i32, 2, 0), "");
1497
1498         return ac_build_llvm7_buffer_load_common(ctx, new_rsrc, vindex, voffset,
1499                                                  num_channels, cache_policy,
1500                                                  can_speculate, true);
1501 }
1502
1503 /// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
1504 /// value for LLVM8+ tbuffer intrinsics.
1505 static unsigned
1506 ac_get_tbuffer_format(struct ac_llvm_context *ctx,
1507                       unsigned dfmt, unsigned nfmt)
1508 {
1509         if (ctx->chip_class >= GFX10) {
1510                 unsigned format;
1511                 switch (dfmt) {
1512                 default: unreachable("bad dfmt");
1513                 case V_008F0C_BUF_DATA_FORMAT_INVALID: format = V_008F0C_IMG_FORMAT_INVALID; break;
1514                 case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break;
1515                 case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break;
1516                 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break;
1517                 case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break;
1518                 case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break;
1519                 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break;
1520                 case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break;
1521                 case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break;
1522                 case V_008F0C_BUF_DATA_FORMAT_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_UINT; break;
1523                 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break;
1524                 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; break;
1525                 }
1526
1527                 // Use the regularity properties of the combined format enum.
1528                 //
1529                 // Note: float is incompatible with 8-bit data formats,
1530                 //       [us]{norm,scaled} are incomparible with 32-bit data formats.
1531                 //       [us]scaled are not writable.
1532                 switch (nfmt) {
1533                 case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break;
1534                 case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break;
1535                 case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break;
1536                 case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break;
1537                 default: unreachable("bad nfmt");
1538                 case V_008F0C_BUF_NUM_FORMAT_UINT: break;
1539                 case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break;
1540                 case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break;
1541                 }
1542
1543                 return format;
1544         } else {
1545                 return dfmt | (nfmt << 4);
1546         }
1547 }
1548
1549 static LLVMValueRef
1550 ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
1551                             LLVMValueRef rsrc,
1552                             LLVMValueRef vindex,
1553                             LLVMValueRef voffset,
1554                             LLVMValueRef soffset,
1555                             unsigned num_channels,
1556                             unsigned dfmt,
1557                             unsigned nfmt,
1558                             unsigned cache_policy,
1559                             bool can_speculate,
1560                             bool structurized)
1561 {
1562         LLVMValueRef args[6];
1563         int idx = 0;
1564         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1565         if (structurized)
1566                 args[idx++] = vindex ? vindex : ctx->i32_0;
1567         args[idx++] = voffset ? voffset : ctx->i32_0;
1568         args[idx++] = soffset ? soffset : ctx->i32_0;
1569         args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
1570         args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1571         unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1572         const char *indexing_kind = structurized ? "struct" : "raw";
1573         char name[256], type_name[8];
1574
1575         LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1576         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1577
1578         snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
1579                  indexing_kind, type_name);
1580
1581         return ac_build_intrinsic(ctx, name, type, args, idx,
1582                                   ac_get_load_intr_attribs(can_speculate));
1583 }
1584
1585 static LLVMValueRef
1586 ac_build_tbuffer_load(struct ac_llvm_context *ctx,
1587                             LLVMValueRef rsrc,
1588                             LLVMValueRef vindex,
1589                             LLVMValueRef voffset,
1590                             LLVMValueRef soffset,
1591                             LLVMValueRef immoffset,
1592                             unsigned num_channels,
1593                             unsigned dfmt,
1594                             unsigned nfmt,
1595                             unsigned cache_policy,
1596                             bool can_speculate,
1597                             bool structurized) /* only matters for LLVM 8+ */
1598 {
1599         if (HAVE_LLVM >= 0x800) {
1600                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1601
1602                 return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset,
1603                                                    soffset, num_channels,
1604                                                    dfmt, nfmt, cache_policy,
1605                                                    can_speculate, structurized);
1606         }
1607
1608         LLVMValueRef args[] = {
1609                 rsrc,
1610                 vindex ? vindex : ctx->i32_0,
1611                 voffset,
1612                 soffset,
1613                 immoffset,
1614                 LLVMConstInt(ctx->i32, dfmt, false),
1615                 LLVMConstInt(ctx->i32, nfmt, false),
1616                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false),
1617                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false),
1618         };
1619         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1620         LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
1621         const char *type_names[] = {"i32", "v2i32", "v4i32"};
1622         char name[256];
1623
1624         snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s",
1625                  type_names[func]);
1626
1627         return ac_build_intrinsic(ctx, name, types[func], args, 9,
1628                                   ac_get_load_intr_attribs(can_speculate));
1629 }
1630
1631 LLVMValueRef
1632 ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx,
1633                              LLVMValueRef rsrc,
1634                              LLVMValueRef vindex,
1635                              LLVMValueRef voffset,
1636                              LLVMValueRef soffset,
1637                              LLVMValueRef immoffset,
1638                              unsigned num_channels,
1639                              unsigned dfmt,
1640                              unsigned nfmt,
1641                              unsigned cache_policy,
1642                              bool can_speculate)
1643 {
1644         return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset,
1645                                      immoffset, num_channels, dfmt, nfmt,
1646                                      cache_policy, can_speculate, true);
1647 }
1648
1649 LLVMValueRef
1650 ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx,
1651                           LLVMValueRef rsrc,
1652                           LLVMValueRef voffset,
1653                           LLVMValueRef soffset,
1654                           LLVMValueRef immoffset,
1655                           unsigned num_channels,
1656                           unsigned dfmt,
1657                           unsigned nfmt,
1658                           unsigned cache_policy,
1659                           bool can_speculate)
1660 {
1661         return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset,
1662                                      immoffset, num_channels, dfmt, nfmt,
1663                                      cache_policy, can_speculate, false);
1664 }
1665
1666 LLVMValueRef
1667 ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
1668                             LLVMValueRef rsrc,
1669                             LLVMValueRef voffset,
1670                             LLVMValueRef soffset,
1671                             LLVMValueRef immoffset,
1672                             unsigned cache_policy)
1673 {
1674         LLVMValueRef res;
1675
1676         if (HAVE_LLVM >= 0x900) {
1677                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1678
1679                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1680                 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1681                                                         voffset, soffset,
1682                                                         1, ctx->i16, cache_policy,
1683                                                         false, false, false);
1684         } else {
1685                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1686                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1687
1688                 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1689                                                 immoffset, 1, dfmt, nfmt, cache_policy,
1690                                                 false);
1691
1692                 res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
1693         }
1694
1695         return res;
1696 }
1697
1698 LLVMValueRef
1699 ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx,
1700                            LLVMValueRef rsrc,
1701                            LLVMValueRef voffset,
1702                            LLVMValueRef soffset,
1703                            LLVMValueRef immoffset,
1704                            unsigned cache_policy)
1705 {
1706         LLVMValueRef res;
1707
1708         if (HAVE_LLVM >= 0x900) {
1709                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1710
1711                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1712                 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1713                                                         voffset, soffset,
1714                                                         1, ctx->i8, cache_policy,
1715                                                         false, false, false);
1716         } else {
1717                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1718                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1719
1720                 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1721                                                 immoffset, 1, dfmt, nfmt, cache_policy,
1722                                                 false);
1723
1724                 res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
1725         }
1726
1727         return res;
1728 }
1729
1730 /**
1731  * Convert an 11- or 10-bit unsigned floating point number to an f32.
1732  *
1733  * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1734  * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1735  */
1736 static LLVMValueRef
1737 ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits, unsigned mant_bits)
1738 {
1739         assert(LLVMTypeOf(src) == ctx->i32);
1740
1741         LLVMValueRef tmp;
1742         LLVMValueRef mantissa;
1743         mantissa = LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1744
1745         /* Converting normal numbers is just a shift + correcting the exponent bias */
1746         unsigned normal_shift = 23 - mant_bits;
1747         unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1748         LLVMValueRef shifted, normal;
1749
1750         shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1751         normal = LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1752
1753         /* Converting nan/inf numbers is the same, but with a different exponent update */
1754         LLVMValueRef naninf;
1755         naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1756
1757         /* Converting denormals is the complex case: determine the leading zeros of the
1758          * mantissa to obtain the correct shift for the mantissa and exponent correction.
1759          */
1760         LLVMValueRef denormal;
1761         LLVMValueRef params[2] = {
1762                 mantissa,
1763                 ctx->i1true, /* result can be undef when arg is 0 */
1764         };
1765         LLVMValueRef ctlz = ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32,
1766                                               params, 2, AC_FUNC_ATTR_READNONE);
1767
1768         /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1769         tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1770         denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1771
1772         unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1773         tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1774         tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1775         denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1776
1777         /* Select the final result. */
1778         LLVMValueRef result;
1779
1780         tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1781                             LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
1782         result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1783
1784         tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1785                             LLVMConstInt(ctx->i32, 1 << mant_bits, false), "");
1786         result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1787
1788         tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1789         result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1790
1791         return ac_to_float(ctx, result);
1792 }
1793
/**
 * Generate a fully general open coded buffer format fetch with all required
 * fixups suitable for vertex fetch, using non-format buffer loads.
 *
 * The function works in stages: pick a load granularity, emit the raw
 * loads, recombine or split the loaded values into one value per channel,
 * unpack the special packed formats, convert every channel to a 32-bit
 * float or integer, fill in missing channels and optionally swap X and Z.
 *
 * Some combinations of argument values have special interpretations:
 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
 *
 * \param log_size log2(size of channel in bytes)
 * \param num_channels number of channels (1 to 4)
 * \param format AC_FETCH_FORMAT_xxx value
 * \param reverse whether XYZ channels are reversed
 * \param known_aligned whether the source is known to be aligned to hardware's
 *                      effective element size for loading the given format
 *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
 * \param rsrc buffer resource descriptor
 * \param vindex passed through to the buffer load helper
 * \param voffset passed through to the buffer load helper (added to the
 *                per-load offset on the pre-LLVM-8 path)
 * \param soffset base offset; each load adds its own byte offset to it
 * \param cache_policy passed through to the buffer load helper
 * \param can_speculate passed through to the buffer load helper
 * \return the resulting vector of floats or integers bitcast to <4 x i32>
 */
LLVMValueRef
ac_build_opencoded_load_format(struct ac_llvm_context *ctx,
                               unsigned log_size,
                               unsigned num_channels,
                               unsigned format,
                               bool reverse,
                               bool known_aligned,
                               LLVMValueRef rsrc,
                               LLVMValueRef vindex,
                               LLVMValueRef voffset,
                               LLVMValueRef soffset,
                               unsigned cache_policy,
                               bool can_speculate)
{
        LLVMValueRef tmp;
        /* Granularity of the loads actually emitted; starts out as the
         * channel layout and is adjusted below.
         */
        unsigned load_log_size = log_size;
        unsigned load_num_channels = num_channels;
        if (log_size == 3) {
                /* 8-byte "channels" are fetched as dwords. */
                load_log_size = 2;
                if (format == AC_FETCH_FORMAT_FLOAT) {
                        /* Two dwords per 64-bit float channel. */
                        load_num_channels = 2 * num_channels;
                } else {
                        load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
                }
        }

        /* log_recombine > 0: 2^log_recombine loaded values are merged into one.
         * log_recombine < 0: each loaded value is split into 2^-log_recombine.
         */
        int log_recombine = 0;
        if (ctx->chip_class == GFX6 && !known_aligned) {
                /* Avoid alignment restrictions by loading one byte at a time. */
                load_num_channels <<= load_log_size;
                log_recombine = load_log_size;
                load_log_size = 0;
        } else if (load_num_channels == 2 || load_num_channels == 4) {
                /* Fetch all channels with a single wider load and split later. */
                log_recombine = -util_logbase2(load_num_channels);
                load_num_channels = 1;
                load_log_size += -log_recombine;
        }

        /* Loads narrower than a dword are only accepted with LLVM 9+. */
        assert(load_log_size >= 2 || HAVE_LLVM >= 0x0900);

        LLVMValueRef loads[32]; /* up to 32 bytes */
        for (unsigned i = 0; i < load_num_channels; ++i) {
                /* Byte offset of load i relative to soffset. */
                tmp = LLVMBuildAdd(ctx->builder, soffset,
                                   LLVMConstInt(ctx->i32, i << load_log_size, false), "");
                if (HAVE_LLVM >= 0x0800) {
                        LLVMTypeRef channel_type = load_log_size == 0 ? ctx->i8 :
                                                   load_log_size == 1 ? ctx->i16 : ctx->i32;
                        /* Number of channel_type elements per load; this local
                         * shadows the num_channels parameter inside this block.
                         */
                        unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
                        loads[i] = ac_build_llvm8_buffer_load_common(
                                        ctx, rsrc, vindex, voffset, tmp,
                                        num_channels, channel_type, cache_policy,
                                        can_speculate, false, true);
                } else {
                        /* The older helper takes a single offset, so fold voffset in. */
                        tmp = LLVMBuildAdd(ctx->builder, voffset, tmp, "");
                        loads[i] = ac_build_llvm7_buffer_load_common(
                                        ctx, rsrc, vindex, tmp,
                                        1 << (load_log_size - 2), cache_policy, can_speculate, false);
                }
                if (load_log_size >= 2)
                        loads[i] = ac_to_integer(ctx, loads[i]);
        }

        if (log_recombine > 0) {
                /* Recombine bytes if necessary (GFX6 only) */
                LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;

                /* Consume 2^log_recombine source values per destination value;
                 * source i of a group lands at bit position 8 * i.
                 */
                for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
                        LLVMValueRef accum = NULL;
                        for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
                                tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
                                if (i == 0) {
                                        accum = tmp;
                                } else {
                                        tmp = LLVMBuildShl(ctx->builder, tmp,
                                                           LLVMConstInt(dst_type, 8 * i, false), "");
                                        accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
                                }
                        }
                        loads[dst] = accum;
                }
        } else if (log_recombine < 0) {
                /* Split vectors of dwords */
                if (load_log_size > 2) {
                        assert(load_num_channels == 1);
                        LLVMValueRef loaded = loads[0];
                        unsigned log_split = load_log_size - 2;
                        /* Part of the pending split is done by this extraction. */
                        log_recombine += log_split;
                        load_num_channels = 1 << log_split;
                        load_log_size = 2;
                        for (unsigned i = 0; i < load_num_channels; ++i) {
                                tmp = LLVMConstInt(ctx->i32, i, false);
                                loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
                        }
                }

                /* Further split dwords and shorts if required */
                if (log_recombine < 0) {
                        /* loads[] is expanded in place. Walking from the last
                         * element backwards keeps the writes from clobbering
                         * source values that have not been read yet.
                         */
                        for (unsigned src = load_num_channels,
                                      dst = load_num_channels << -log_recombine;
                             src > 0; --src) {
                                unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
                                LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
                                LLVMValueRef loaded = loads[src - 1];
                                LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
                                for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
                                        /* Piece i - 1 sits dst_bits * (i - 1) bits up. */
                                        tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
                                        tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
                                        loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
                                }
                        }
                }
        }

        /* Post-process the 8-byte cases: 64-bit floats and the packed formats. */
        if (log_size == 3) {
                if (format == AC_FETCH_FORMAT_FLOAT) {
                        /* Pair up two dwords per channel and view them as f64. */
                        for (unsigned i = 0; i < num_channels; ++i) {
                                tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
                                loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
                        }
                } else if (format == AC_FETCH_FORMAT_FIXED) {
                        /* 10_11_11_FLOAT */
                        LLVMValueRef data = loads[0];
                        LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
                        /* r: bits 0-10, g: bits 11-21, b: bits 22-31. */
                        LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
                        tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
                        LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
                        LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");

                        /* Arguments are (exponent bits, mantissa bits). */
                        loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
                        loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
                        loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));

                        /* From here on this is a plain 3-channel 32-bit float fetch. */
                        num_channels = 3;
                        log_size = 2;
                        format = AC_FETCH_FORMAT_FLOAT;
                } else {
                        /* 2_10_10_10 data formats */
                        LLVMValueRef data = loads[0];
                        LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
                        LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
                        /* Three 10-bit fields from bit 0 upwards, then a 2-bit field. */
                        loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
                        tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
                        loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
                        tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
                        loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
                        tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
                        loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");

                        num_channels = 4;
                }
        }

        /* Convert each channel to its final 32-bit representation. */
        if (format == AC_FETCH_FORMAT_FLOAT) {
                if (log_size != 2) {
                        /* Narrow 64-bit or widen 16-bit floats to f32. */
                        for (unsigned chan = 0; chan < num_channels; ++chan) {
                                tmp = ac_to_float(ctx, loads[chan]);
                                if (log_size == 3)
                                        tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
                                else if (log_size == 1)
                                        tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
                                loads[chan] = ac_to_integer(ctx, tmp);
                        }
                }
        } else if (format == AC_FETCH_FORMAT_UINT) {
                if (log_size != 2) {
                        /* Zero-extend to i32. */
                        for (unsigned chan = 0; chan < num_channels; ++chan)
                                loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
                }
        } else if (format == AC_FETCH_FORMAT_SINT) {
                if (log_size != 2) {
                        /* Sign-extend to i32. */
                        for (unsigned chan = 0; chan < num_channels; ++chan)
                                loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
                }
        } else {
                /* Remaining formats: integer -> float conversion, optionally scaled.
                 * The UINT comparison cannot be true in this branch because UINT
                 * is handled above.
                 */
                bool unsign = format == AC_FETCH_FORMAT_UNORM ||
                              format == AC_FETCH_FORMAT_USCALED ||
                              format == AC_FETCH_FORMAT_UINT;

                for (unsigned chan = 0; chan < num_channels; ++chan) {
                        if (unsign) {
                                tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
                        } else {
                                tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
                        }

                        /* The scale depends on the channel's integer width, which
                         * can differ per channel (e.g. 10/10/10/2).
                         */
                        LLVMValueRef scale = NULL;
                        if (format == AC_FETCH_FORMAT_FIXED) {
                                assert(log_size == 2);
                                scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
                        } else if (format == AC_FETCH_FORMAT_UNORM) {
                                unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
                                scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
                        } else if (format == AC_FETCH_FORMAT_SNORM) {
                                unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
                                scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
                        }
                        if (scale)
                                tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");

                        if (format == AC_FETCH_FORMAT_SNORM) {
                                /* Clamp to [-1, 1]; only the lower bound is enforced here. */
                                LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
                                LLVMValueRef clamp =
                                        LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
                                tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
                        }

                        loads[chan] = ac_to_integer(ctx, tmp);
                }
        }

        /* Fill missing channels: 0 for channels 0-2 and 1 for channel 3, as
         * integers for UINT/SINT and as floats otherwise.
         */
        while (num_channels < 4) {
                if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
                        loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
                } else {
                        loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
                }
                num_channels++;
        }

        /* Swap channels 0 and 2. */
        if (reverse) {
                tmp = loads[0];
                loads[0] = loads[2];
                loads[2] = tmp;
        }

        return ac_build_gather_values(ctx, loads, 4);
}
2040
2041 static void
2042 ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
2043                              LLVMValueRef rsrc,
2044                              LLVMValueRef vdata,
2045                              LLVMValueRef vindex,
2046                              LLVMValueRef voffset,
2047                              LLVMValueRef soffset,
2048                              unsigned num_channels,
2049                              unsigned dfmt,
2050                              unsigned nfmt,
2051                              unsigned cache_policy,
2052                              bool structurized)
2053 {
2054         LLVMValueRef args[7];
2055         int idx = 0;
2056         args[idx++] = vdata;
2057         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
2058         if (structurized)
2059                 args[idx++] = vindex ? vindex : ctx->i32_0;
2060         args[idx++] = voffset ? voffset : ctx->i32_0;
2061         args[idx++] = soffset ? soffset : ctx->i32_0;
2062         args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
2063         args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
2064         unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
2065         const char *indexing_kind = structurized ? "struct" : "raw";
2066         char name[256], type_name[8];
2067
2068         LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
2069         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
2070
2071         snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
2072                  indexing_kind, type_name);
2073
2074         ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
2075                            AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
2076 }
2077
2078 static void
2079 ac_build_tbuffer_store(struct ac_llvm_context *ctx,
2080                        LLVMValueRef rsrc,
2081                        LLVMValueRef vdata,
2082                        LLVMValueRef vindex,
2083                        LLVMValueRef voffset,
2084                        LLVMValueRef soffset,
2085                        LLVMValueRef immoffset,
2086                        unsigned num_channels,
2087                        unsigned dfmt,
2088                        unsigned nfmt,
2089                        unsigned cache_policy,
2090                        bool structurized) /* only matters for LLVM 8+ */
2091 {
2092         if (HAVE_LLVM >= 0x800) {
2093                 voffset = LLVMBuildAdd(ctx->builder,
2094                                        voffset ? voffset : ctx->i32_0,
2095                                        immoffset, "");
2096
2097                 ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
2098                                              soffset, num_channels, dfmt, nfmt,
2099                                              cache_policy, structurized);
2100         } else {
2101                 LLVMValueRef params[] = {
2102                         vdata,
2103                         rsrc,
2104                         vindex ? vindex : ctx->i32_0,
2105                         voffset ? voffset : ctx->i32_0,
2106                         soffset ? soffset : ctx->i32_0,
2107                         immoffset,
2108                         LLVMConstInt(ctx->i32, dfmt, false),
2109                         LLVMConstInt(ctx->i32, nfmt, false),
2110                         LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false),
2111                         LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false),
2112                 };
2113                 unsigned func = CLAMP(num_channels, 1, 3) - 1;
2114                 const char *type_names[] = {"i32", "v2i32", "v4i32"};
2115                 char name[256];
2116
2117                 snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
2118                          type_names[func]);
2119
2120                 ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
2121                                    AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
2122         }
2123 }
2124
2125 void
2126 ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
2127                               LLVMValueRef rsrc,
2128                               LLVMValueRef vdata,
2129                               LLVMValueRef vindex,
2130                               LLVMValueRef voffset,
2131                               LLVMValueRef soffset,
2132                               LLVMValueRef immoffset,
2133                               unsigned num_channels,
2134                               unsigned dfmt,
2135                               unsigned nfmt,
2136                               unsigned cache_policy)
2137 {
2138         ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
2139                                immoffset, num_channels, dfmt, nfmt, cache_policy,
2140                                true);
2141 }
2142
2143 void
2144 ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
2145                            LLVMValueRef rsrc,
2146                            LLVMValueRef vdata,
2147                            LLVMValueRef voffset,
2148                            LLVMValueRef soffset,
2149                            LLVMValueRef immoffset,
2150                            unsigned num_channels,
2151                            unsigned dfmt,
2152                            unsigned nfmt,
2153                            unsigned cache_policy)
2154 {
2155         ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
2156                                immoffset, num_channels, dfmt, nfmt, cache_policy,
2157                                false);
2158 }
2159
2160 void
2161 ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
2162                              LLVMValueRef rsrc,
2163                              LLVMValueRef vdata,
2164                              LLVMValueRef voffset,
2165                              LLVMValueRef soffset,
2166                              unsigned cache_policy)
2167 {
2168         vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
2169
2170         if (HAVE_LLVM >= 0x900) {
2171                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
2172                 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
2173                                                    voffset, soffset, 1,
2174                                                    ctx->i16, cache_policy,
2175                                                    false, false);
2176         } else {
2177                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
2178                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
2179
2180                 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
2181
2182                 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
2183                                            ctx->i32_0, 1, dfmt, nfmt, cache_policy);
2184         }
2185 }
2186
2187 void
2188 ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
2189                             LLVMValueRef rsrc,
2190                             LLVMValueRef vdata,
2191                             LLVMValueRef voffset,
2192                             LLVMValueRef soffset,
2193                             unsigned cache_policy)
2194 {
2195         vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
2196
2197         if (HAVE_LLVM >= 0x900) {
2198                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
2199                 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
2200                                                    voffset, soffset, 1,
2201                                                    ctx->i8, cache_policy,
2202                                                    false, false);
2203         } else {
2204                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
2205                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
2206
2207                 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
2208
2209                 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
2210                                            ctx->i32_0, 1, dfmt, nfmt, cache_policy);
2211         }
2212 }
2213 /**
2214  * Set range metadata on an instruction.  This can only be used on load and
2215  * call instructions.  If you know an instruction can only produce the values
2216  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
2217  * \p lo is the minimum value inclusive.
2218  * \p hi is the maximum value exclusive.
2219  */
2220 static void set_range_metadata(struct ac_llvm_context *ctx,
2221                                LLVMValueRef value, unsigned lo, unsigned hi)
2222 {
2223         LLVMValueRef range_md, md_args[2];
2224         LLVMTypeRef type = LLVMTypeOf(value);
2225         LLVMContextRef context = LLVMGetTypeContext(type);
2226
2227         md_args[0] = LLVMConstInt(type, lo, false);
2228         md_args[1] = LLVMConstInt(type, hi, false);
2229         range_md = LLVMMDNodeInContext(context, md_args, 2);
2230         LLVMSetMetadata(value, ctx->range_md_kind, range_md);
2231 }
2232
2233 LLVMValueRef
2234 ac_get_thread_id(struct ac_llvm_context *ctx)
2235 {
2236         LLVMValueRef tid;
2237
2238         LLVMValueRef tid_args[2];
2239         tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
2240         tid_args[1] = ctx->i32_0;
2241         tid_args[1] = ac_build_intrinsic(ctx,
2242                                          "llvm.amdgcn.mbcnt.lo", ctx->i32,
2243                                          tid_args, 2, AC_FUNC_ATTR_READNONE);
2244
2245         if (ctx->wave_size == 32) {
2246                 tid = tid_args[1];
2247         } else {
2248                 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
2249                                          ctx->i32, tid_args,
2250                                          2, AC_FUNC_ATTR_READNONE);
2251         }
2252         set_range_metadata(ctx, tid, 0, ctx->wave_size);
2253         return tid;
2254 }
2255
2256 /*
2257  * AMD GCN implements derivatives using the local data store (LDS)
2258  * All writes to the LDS happen in all executing threads at
2259  * the same time. TID is the Thread ID for the current
2260  * thread and is a value between 0 and 63, representing
2261  * the thread's position in the wavefront.
2262  *
2263  * For the pixel shader threads are grouped into quads of four pixels.
2264  * The TIDs of the pixels of a quad are:
2265  *
2266  *  +------+------+
2267  *  |4n + 0|4n + 1|
2268  *  +------+------+
2269  *  |4n + 2|4n + 3|
2270  *  +------+------+
2271  *
2272  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
2273  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
2274  * the current pixel's column, and masking with 0xfffffffe yields the TID
2275  * of the left pixel of the current pixel's row.
2276  *
2277  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
2278  * adding 2 yields the TID of the pixel below the top pixel.
2279  */
2280 LLVMValueRef
2281 ac_build_ddxy(struct ac_llvm_context *ctx,
2282               uint32_t mask,
2283               int idx,
2284               LLVMValueRef val)
2285 {
2286         unsigned tl_lanes[4], trbl_lanes[4];
2287         char name[32], type[8];
2288         LLVMValueRef tl, trbl;
2289         LLVMTypeRef result_type;
2290         LLVMValueRef result;
2291
2292         result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
2293
2294         if (result_type == ctx->f16)
2295                 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
2296
2297         for (unsigned i = 0; i < 4; ++i) {
2298                 tl_lanes[i] = i & mask;
2299                 trbl_lanes[i] = (i & mask) + idx;
2300         }
2301
2302         tl = ac_build_quad_swizzle(ctx, val,
2303                                    tl_lanes[0], tl_lanes[1],
2304                                    tl_lanes[2], tl_lanes[3]);
2305         trbl = ac_build_quad_swizzle(ctx, val,
2306                                      trbl_lanes[0], trbl_lanes[1],
2307                                      trbl_lanes[2], trbl_lanes[3]);
2308
2309         if (result_type == ctx->f16) {
2310                 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
2311                 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
2312         }
2313
2314         tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
2315         trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
2316         result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
2317
2318         ac_build_type_name_for_intr(result_type, type, sizeof(type));
2319         snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
2320
2321         return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
2322 }
2323
2324 void
2325 ac_build_sendmsg(struct ac_llvm_context *ctx,
2326                  uint32_t msg,
2327                  LLVMValueRef wave_id)
2328 {
2329         LLVMValueRef args[2];
2330         args[0] = LLVMConstInt(ctx->i32, msg, false);
2331         args[1] = wave_id;
2332         ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
2333 }
2334
2335 LLVMValueRef
2336 ac_build_imsb(struct ac_llvm_context *ctx,
2337               LLVMValueRef arg,
2338               LLVMTypeRef dst_type)
2339 {
2340         LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
2341                                               dst_type, &arg, 1,
2342                                               AC_FUNC_ATTR_READNONE);
2343
2344         /* The HW returns the last bit index from MSB, but NIR/TGSI wants
2345          * the index from LSB. Invert it by doing "31 - msb". */
2346         msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
2347                            msb, "");
2348
2349         LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
2350         LLVMValueRef cond = LLVMBuildOr(ctx->builder,
2351                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2352                                                       arg, ctx->i32_0, ""),
2353                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2354                                                       arg, all_ones, ""), "");
2355
2356         return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
2357 }
2358
2359 LLVMValueRef
2360 ac_build_umsb(struct ac_llvm_context *ctx,
2361               LLVMValueRef arg,
2362               LLVMTypeRef dst_type)
2363 {
2364         const char *intrin_name;
2365         LLVMTypeRef type;
2366         LLVMValueRef highest_bit;
2367         LLVMValueRef zero;
2368         unsigned bitsize;
2369
2370         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
2371         switch (bitsize) {
2372         case 64:
2373                 intrin_name = "llvm.ctlz.i64";
2374                 type = ctx->i64;
2375                 highest_bit = LLVMConstInt(ctx->i64, 63, false);
2376                 zero = ctx->i64_0;
2377                 break;
2378         case 32:
2379                 intrin_name = "llvm.ctlz.i32";
2380                 type = ctx->i32;
2381                 highest_bit = LLVMConstInt(ctx->i32, 31, false);
2382                 zero = ctx->i32_0;
2383                 break;
2384         case 16:
2385                 intrin_name = "llvm.ctlz.i16";
2386                 type = ctx->i16;
2387                 highest_bit = LLVMConstInt(ctx->i16, 15, false);
2388                 zero = ctx->i16_0;
2389                 break;
2390         case 8:
2391                 intrin_name = "llvm.ctlz.i8";
2392                 type = ctx->i8;
2393                 highest_bit = LLVMConstInt(ctx->i8, 7, false);
2394                 zero = ctx->i8_0;
2395                 break;
2396         default:
2397                 unreachable(!"invalid bitsize");
2398                 break;
2399         }
2400
2401         LLVMValueRef params[2] = {
2402                 arg,
2403                 ctx->i1true,
2404         };
2405
2406         LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
2407                                               params, 2,
2408                                               AC_FUNC_ATTR_READNONE);
2409
2410         /* The HW returns the last bit index from MSB, but TGSI/NIR wants
2411          * the index from LSB. Invert it by doing "31 - msb". */
2412         msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
2413
2414         if (bitsize == 64) {
2415                 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
2416         } else if (bitsize < 32) {
2417                 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
2418         }
2419
2420         /* check for zero */
2421         return LLVMBuildSelect(ctx->builder,
2422                                LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
2423                                LLVMConstInt(ctx->i32, -1, true), msb, "");
2424 }
2425
2426 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
2427                            LLVMValueRef b)
2428 {
2429         char name[64];
2430         snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2431         LLVMValueRef args[2] = {a, b};
2432         return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2433                                   AC_FUNC_ATTR_READNONE);
2434 }
2435
2436 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
2437                            LLVMValueRef b)
2438 {
2439         char name[64];
2440         snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2441         LLVMValueRef args[2] = {a, b};
2442         return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2443                                   AC_FUNC_ATTR_READNONE);
2444 }
2445
2446 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a,
2447                            LLVMValueRef b)
2448 {
2449         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
2450         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2451 }
2452
2453 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a,
2454                            LLVMValueRef b)
2455 {
2456         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
2457         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2458 }
2459
2460 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
2461                            LLVMValueRef b)
2462 {
2463         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
2464         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2465 }
2466
2467 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a,
2468                            LLVMValueRef b)
2469 {
2470         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
2471         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2472 }
2473
2474 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
2475 {
2476         LLVMTypeRef t = LLVMTypeOf(value);
2477         return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
2478                              LLVMConstReal(t, 1.0));
2479 }
2480
2481 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
2482 {
2483         LLVMValueRef args[9];
2484
2485         args[0] = LLVMConstInt(ctx->i32, a->target, 0);
2486         args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
2487
2488         if (a->compr) {
2489                 LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
2490                 LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
2491
2492                 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
2493                                 v2i16, "");
2494                 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
2495                                 v2i16, "");
2496                 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
2497                 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2498
2499                 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
2500                                    ctx->voidt, args, 6, 0);
2501         } else {
2502                 args[2] = a->out[0];
2503                 args[3] = a->out[1];
2504                 args[4] = a->out[2];
2505                 args[5] = a->out[3];
2506                 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
2507                 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2508
2509                 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
2510                                    ctx->voidt, args, 8, 0);
2511         }
2512 }
2513
2514 void ac_build_export_null(struct ac_llvm_context *ctx)
2515 {
2516         struct ac_export_args args;
2517
2518         args.enabled_channels = 0x0; /* enabled channels */
2519         args.valid_mask = 1; /* whether the EXEC mask is valid */
2520         args.done = 1; /* DONE bit */
2521         args.target = V_008DFC_SQ_EXP_NULL;
2522         args.compr = 0; /* COMPR flag (0 = 32-bit export) */
2523         args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2524         args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2525         args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2526         args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2527
2528         ac_build_export(ctx, &args);
2529 }
2530
2531 static unsigned ac_num_coords(enum ac_image_dim dim)
2532 {
2533         switch (dim) {
2534         case ac_image_1d:
2535                 return 1;
2536         case ac_image_2d:
2537         case ac_image_1darray:
2538                  return 2;
2539         case ac_image_3d:
2540         case ac_image_cube:
2541         case ac_image_2darray:
2542         case ac_image_2dmsaa:
2543                 return 3;
2544         case ac_image_2darraymsaa:
2545                 return 4;
2546         default:
2547                 unreachable("ac_num_coords: bad dim");
2548         }
2549 }
2550
2551 static unsigned ac_num_derivs(enum ac_image_dim dim)
2552 {
2553         switch (dim) {
2554         case ac_image_1d:
2555         case ac_image_1darray:
2556                 return 2;
2557         case ac_image_2d:
2558         case ac_image_2darray:
2559         case ac_image_cube:
2560                 return 4;
2561         case ac_image_3d:
2562                 return 6;
2563         case ac_image_2dmsaa:
2564         case ac_image_2darraymsaa:
2565         default:
2566                 unreachable("derivatives not supported");
2567         }
2568 }
2569
2570 static const char *get_atomic_name(enum ac_atomic_op op)
2571 {
2572         switch (op) {
2573         case ac_atomic_swap: return "swap";
2574         case ac_atomic_add: return "add";
2575         case ac_atomic_sub: return "sub";
2576         case ac_atomic_smin: return "smin";
2577         case ac_atomic_umin: return "umin";
2578         case ac_atomic_smax: return "smax";
2579         case ac_atomic_umax: return "umax";
2580         case ac_atomic_and: return "and";
2581         case ac_atomic_or: return "or";
2582         case ac_atomic_xor: return "xor";
2583         }
2584         unreachable("bad atomic op");
2585 }
2586
2587 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
2588                                    struct ac_image_args *a)
2589 {
2590         const char *overload[3] = { "", "", "" };
2591         unsigned num_overloads = 0;
2592         LLVMValueRef args[18];
2593         unsigned num_args = 0;
2594         enum ac_image_dim dim = a->dim;
2595
2596         assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
2597                !a->level_zero);
2598         assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
2599                 a->opcode != ac_image_store_mip) ||
2600                a->lod);
2601         assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2602                (!a->compare && !a->offset));
2603         assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2604                 a->opcode == ac_image_get_lod) ||
2605                !a->bias);
2606         assert((a->bias ? 1 : 0) +
2607                (a->lod ? 1 : 0) +
2608                (a->level_zero ? 1 : 0) +
2609                (a->derivs[0] ? 1 : 0) <= 1);
2610
2611         if (a->opcode == ac_image_get_lod) {
2612                 switch (dim) {
2613                 case ac_image_1darray:
2614                         dim = ac_image_1d;
2615                         break;
2616                 case ac_image_2darray:
2617                 case ac_image_cube:
2618                         dim = ac_image_2d;
2619                         break;
2620                 default:
2621                         break;
2622                 }
2623         }
2624
2625         bool sample = a->opcode == ac_image_sample ||
2626                       a->opcode == ac_image_gather4 ||
2627                       a->opcode == ac_image_get_lod;
2628         bool atomic = a->opcode == ac_image_atomic ||
2629                       a->opcode == ac_image_atomic_cmpswap;
2630         bool load = a->opcode == ac_image_sample ||
2631                     a->opcode == ac_image_gather4 ||
2632                     a->opcode == ac_image_load ||
2633                     a->opcode == ac_image_load_mip;
2634         LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
2635
2636         if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2637                 args[num_args++] = a->data[0];
2638                 if (a->opcode == ac_image_atomic_cmpswap)
2639                         args[num_args++] = a->data[1];
2640         }
2641
2642         if (!atomic)
2643                 args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
2644
2645         if (a->offset)
2646                 args[num_args++] = ac_to_integer(ctx, a->offset);
2647         if (a->bias) {
2648                 args[num_args++] = ac_to_float(ctx, a->bias);
2649                 overload[num_overloads++] = ".f32";
2650         }
2651         if (a->compare)
2652                 args[num_args++] = ac_to_float(ctx, a->compare);
2653         if (a->derivs[0]) {
2654                 unsigned count = ac_num_derivs(dim);
2655                 for (unsigned i = 0; i < count; ++i)
2656                         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
2657                 overload[num_overloads++] = ".f32";
2658         }
2659         unsigned num_coords =
2660                 a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
2661         for (unsigned i = 0; i < num_coords; ++i)
2662                 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
2663         if (a->lod)
2664                 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
2665         overload[num_overloads++] = sample ? ".f32" : ".i32";
2666
2667         args[num_args++] = a->resource;
2668         if (sample) {
2669                 args[num_args++] = a->sampler;
2670                 args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
2671         }
2672
2673         args[num_args++] = ctx->i32_0; /* texfailctrl */
2674         args[num_args++] = LLVMConstInt(ctx->i32,
2675                                         load ? get_load_cache_policy(ctx, a->cache_policy) :
2676                                                a->cache_policy, false);
2677
2678         const char *name;
2679         const char *atomic_subop = "";
2680         switch (a->opcode) {
2681         case ac_image_sample: name = "sample"; break;
2682         case ac_image_gather4: name = "gather4"; break;
2683         case ac_image_load: name = "load"; break;
2684         case ac_image_load_mip: name = "load.mip"; break;
2685         case ac_image_store: name = "store"; break;
2686         case ac_image_store_mip: name = "store.mip"; break;
2687         case ac_image_atomic:
2688                 name = "atomic.";
2689                 atomic_subop = get_atomic_name(a->atomic);
2690                 break;
2691         case ac_image_atomic_cmpswap:
2692                 name = "atomic.";
2693                 atomic_subop = "cmpswap";
2694                 break;
2695         case ac_image_get_lod: name = "getlod"; break;
2696         case ac_image_get_resinfo: name = "getresinfo"; break;
2697         default: unreachable("invalid image opcode");
2698         }
2699
2700         const char *dimname;
2701         switch (dim) {
2702         case ac_image_1d: dimname = "1d"; break;
2703         case ac_image_2d: dimname = "2d"; break;
2704         case ac_image_3d: dimname = "3d"; break;
2705         case ac_image_cube: dimname = "cube"; break;
2706         case ac_image_1darray: dimname = "1darray"; break;
2707         case ac_image_2darray: dimname = "2darray"; break;
2708         case ac_image_2dmsaa: dimname = "2dmsaa"; break;
2709         case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
2710         default: unreachable("invalid dim");
2711         }
2712
2713         bool lod_suffix =
2714                 a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
2715         char intr_name[96];
2716         snprintf(intr_name, sizeof(intr_name),
2717                  "llvm.amdgcn.image.%s%s" /* base name */
2718                  "%s%s%s" /* sample/gather modifiers */
2719                  ".%s.%s%s%s%s", /* dimension and type overloads */
2720                  name, atomic_subop,
2721                  a->compare ? ".c" : "",
2722                  a->bias ? ".b" :
2723                  lod_suffix ? ".l" :
2724                  a->derivs[0] ? ".d" :
2725                  a->level_zero ? ".lz" : "",
2726                  a->offset ? ".o" : "",
2727                  dimname,
2728                  atomic ? "i32" : "v4f32",
2729                  overload[0], overload[1], overload[2]);
2730
2731         LLVMTypeRef retty;
2732         if (atomic)
2733                 retty = ctx->i32;
2734         else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
2735                 retty = ctx->voidt;
2736         else
2737                 retty = ctx->v4f32;
2738
2739         LLVMValueRef result =
2740                 ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
2741                                    a->attributes);
2742         if (!sample && retty == ctx->v4f32) {
2743                 result = LLVMBuildBitCast(ctx->builder, result,
2744                                           ctx->v4i32, "");
2745         }
2746         return result;
2747 }
2748
2749 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
2750                                     LLVMValueRef args[2])
2751 {
2752         LLVMTypeRef v2f16 =
2753                 LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
2754
2755         return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
2756                                   args, 2, AC_FUNC_ATTR_READNONE);
2757 }
2758
2759 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
2760                                      LLVMValueRef args[2])
2761 {
2762         LLVMValueRef res =
2763                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
2764                                    ctx->v2i16, args, 2,
2765                                    AC_FUNC_ATTR_READNONE);
2766         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2767 }
2768
2769 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
2770                                      LLVMValueRef args[2])
2771 {
2772         LLVMValueRef res =
2773                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
2774                                    ctx->v2i16, args, 2,
2775                                    AC_FUNC_ATTR_READNONE);
2776         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2777 }
2778
2779 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2780 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx,
2781                                  LLVMValueRef args[2], unsigned bits, bool hi)
2782 {
2783         assert(bits == 8 || bits == 10 || bits == 16);
2784
2785         LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2786                 bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2787         LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2788                 bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2789         LLVMValueRef max_alpha =
2790                 bits != 10 ? max_rgb : ctx->i32_1;
2791         LLVMValueRef min_alpha =
2792                 bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2793
2794         /* Clamp. */
2795         if (bits != 16) {
2796                 for (int i = 0; i < 2; i++) {
2797                         bool alpha = hi && i == 1;
2798                         args[i] = ac_build_imin(ctx, args[i],
2799                                                 alpha ? max_alpha : max_rgb);
2800                         args[i] = ac_build_imax(ctx, args[i],
2801                                                 alpha ? min_alpha : min_rgb);
2802                 }
2803         }
2804
2805         LLVMValueRef res =
2806                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
2807                                    ctx->v2i16, args, 2,
2808                                    AC_FUNC_ATTR_READNONE);
2809         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2810 }
2811
2812 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2813 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx,
2814                                  LLVMValueRef args[2], unsigned bits, bool hi)
2815 {
2816         assert(bits == 8 || bits == 10 || bits == 16);
2817
2818         LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2819                 bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2820         LLVMValueRef max_alpha =
2821                 bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2822
2823         /* Clamp. */
2824         if (bits != 16) {
2825                 for (int i = 0; i < 2; i++) {
2826                         bool alpha = hi && i == 1;
2827                         args[i] = ac_build_umin(ctx, args[i],
2828                                                 alpha ? max_alpha : max_rgb);
2829                 }
2830         }
2831
2832         LLVMValueRef res =
2833                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
2834                                    ctx->v2i16, args, 2,
2835                                    AC_FUNC_ATTR_READNONE);
2836         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2837 }
2838
2839 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2840 {
2841         return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
2842                                   &i1, 1, AC_FUNC_ATTR_READNONE);
2843 }
2844
2845 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2846 {
2847         ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
2848                            &i1, 1, 0);
2849 }
2850
2851 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
2852                           LLVMValueRef offset, LLVMValueRef width,
2853                           bool is_signed)
2854 {
2855         LLVMValueRef args[] = {
2856                 input,
2857                 offset,
2858                 width,
2859         };
2860
2861         LLVMValueRef result = ac_build_intrinsic(ctx,
2862                                                  is_signed ? "llvm.amdgcn.sbfe.i32" :
2863                                                              "llvm.amdgcn.ubfe.i32",
2864                                                  ctx->i32, args, 3,
2865                                                  AC_FUNC_ATTR_READNONE);
2866
2867         if (HAVE_LLVM < 0x0800) {
2868                 /* FIXME: LLVM 7+ returns incorrect result when count is 0.
2869                  * https://bugs.freedesktop.org/show_bug.cgi?id=107276
2870                  */
2871                 LLVMValueRef zero = ctx->i32_0;
2872                 LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, width, zero, "");
2873                 result = LLVMBuildSelect(ctx->builder, icond, zero, result, "");
2874         }
2875
2876         return result;
2877 }
2878
2879 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2880                            LLVMValueRef s1, LLVMValueRef s2)
2881 {
2882         return LLVMBuildAdd(ctx->builder,
2883                             LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2884 }
2885
2886 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2887                            LLVMValueRef s1, LLVMValueRef s2)
2888 {
2889         return LLVMBuildFAdd(ctx->builder,
2890                              LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2891 }
2892
2893 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2894 {
2895         if (!wait_flags)
2896                 return;
2897
2898         unsigned lgkmcnt = 63;
2899         unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
2900         unsigned vscnt = 63;
2901
2902         if (wait_flags & AC_WAIT_LGKM)
2903                 lgkmcnt = 0;
2904         if (wait_flags & AC_WAIT_VLOAD)
2905                 vmcnt = 0;
2906
2907         if (wait_flags & AC_WAIT_VSTORE) {
2908                 if (ctx->chip_class >= GFX10)
2909                         vscnt = 0;
2910                 else
2911                         vmcnt = 0;
2912         }
2913
2914         /* There is no intrinsic for vscnt(0), so use a fence. */
2915         if ((wait_flags & AC_WAIT_LGKM &&
2916              wait_flags & AC_WAIT_VLOAD &&
2917              wait_flags & AC_WAIT_VSTORE) ||
2918             vscnt == 0) {
2919                 LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2920                 return;
2921         }
2922
2923         unsigned simm16 = (lgkmcnt << 8) |
2924                           (7 << 4) | /* expcnt */
2925                           (vmcnt & 0xf) |
2926                           ((vmcnt >> 4) << 14);
2927
2928         LLVMValueRef args[1] = {
2929                 LLVMConstInt(ctx->i32, simm16, false),
2930         };
2931         ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
2932                            ctx->voidt, args, 1, 0);
2933 }
2934
2935 LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0,
2936                             LLVMValueRef src1, LLVMValueRef src2,
2937                             unsigned bitsize)
2938 {
2939         LLVMTypeRef type;
2940         char *intr;
2941
2942         if (bitsize == 16) {
2943                 intr = "llvm.amdgcn.fmed3.f16";
2944                 type = ctx->f16;
2945         } else if (bitsize == 32) {
2946                 intr = "llvm.amdgcn.fmed3.f32";
2947                 type = ctx->f32;
2948         } else {
2949                 intr = "llvm.amdgcn.fmed3.f64";
2950                 type = ctx->f64;
2951         }
2952
2953         LLVMValueRef params[] = {
2954                 src0,
2955                 src1,
2956                 src2,
2957         };
2958         return ac_build_intrinsic(ctx, intr, type, params, 3,
2959                                   AC_FUNC_ATTR_READNONE);
2960 }
2961
2962 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
2963                             unsigned bitsize)
2964 {
2965         LLVMTypeRef type;
2966         char *intr;
2967
2968         if (bitsize == 16) {
2969                 intr = "llvm.amdgcn.fract.f16";
2970                 type = ctx->f16;
2971         } else if (bitsize == 32) {
2972                 intr = "llvm.amdgcn.fract.f32";
2973                 type = ctx->f32;
2974         } else {
2975                 intr = "llvm.amdgcn.fract.f64";
2976                 type = ctx->f64;
2977         }
2978
2979         LLVMValueRef params[] = {
2980                 src0,
2981         };
2982         return ac_build_intrinsic(ctx, intr, type, params, 1,
2983                                   AC_FUNC_ATTR_READNONE);
2984 }
2985
2986 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2987                             unsigned bitsize)
2988 {
2989         LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
2990         LLVMValueRef zero = LLVMConstInt(type, 0, false);
2991         LLVMValueRef one = LLVMConstInt(type, 1, false);
2992
2993         LLVMValueRef cmp, val;
2994         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
2995         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2996         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
2997         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
2998         return val;
2999 }
3000
3001 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
3002                             unsigned bitsize)
3003 {
3004         LLVMValueRef cmp, val, zero, one;
3005         LLVMTypeRef type;
3006
3007         if (bitsize == 16) {
3008                 type = ctx->f16;
3009                 zero = ctx->f16_0;
3010                 one = ctx->f16_1;
3011         } else if (bitsize == 32) {
3012                 type = ctx->f32;
3013                 zero = ctx->f32_0;
3014                 one = ctx->f32_1;
3015         } else {
3016                 type = ctx->f64;
3017                 zero = ctx->f64_0;
3018                 one = ctx->f64_1;
3019         }
3020
3021         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
3022         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
3023         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
3024         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
3025         return val;
3026 }
3027
3028 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
3029 {
3030         LLVMValueRef result;
3031         unsigned bitsize;
3032
3033         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3034
3035         switch (bitsize) {
3036         case 64:
3037                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
3038                                             (LLVMValueRef []) { src0 }, 1,
3039                                             AC_FUNC_ATTR_READNONE);
3040
3041                 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
3042                 break;
3043         case 32:
3044                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
3045                                             (LLVMValueRef []) { src0 }, 1,
3046                                             AC_FUNC_ATTR_READNONE);
3047                 break;
3048         case 16:
3049                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
3050                                             (LLVMValueRef []) { src0 }, 1,
3051                                             AC_FUNC_ATTR_READNONE);
3052
3053                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3054                 break;
3055         case 8:
3056                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8,
3057                                             (LLVMValueRef []) { src0 }, 1,
3058                                             AC_FUNC_ATTR_READNONE);
3059
3060                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3061                 break;
3062         default:
3063                 unreachable(!"invalid bitsize");
3064                 break;
3065         }
3066
3067         return result;
3068 }
3069
3070 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
3071                                        LLVMValueRef src0)
3072 {
3073         LLVMValueRef result;
3074         unsigned bitsize;
3075
3076         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3077
3078         switch (bitsize) {
3079         case 64:
3080                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64,
3081                                             (LLVMValueRef []) { src0 }, 1,
3082                                             AC_FUNC_ATTR_READNONE);
3083
3084                 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
3085                 break;
3086         case 32:
3087                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32,
3088                                             (LLVMValueRef []) { src0 }, 1,
3089                                             AC_FUNC_ATTR_READNONE);
3090                 break;
3091         case 16:
3092                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
3093                                             (LLVMValueRef []) { src0 }, 1,
3094                                             AC_FUNC_ATTR_READNONE);
3095
3096                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3097                 break;
3098         case 8:
3099                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8,
3100                                             (LLVMValueRef []) { src0 }, 1,
3101                                             AC_FUNC_ATTR_READNONE);
3102
3103                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3104                 break;
3105         default:
3106                 unreachable(!"invalid bitsize");
3107                 break;
3108         }
3109
3110         return result;
3111 }
3112
/* Operand indices of an export call instruction, as used with
 * LLVMGetOperand/LLVMSetOperand by the VS output optimization below.
 */
#define AC_EXP_TARGET           0
#define AC_EXP_ENABLED_CHANNELS 1
/* First of the four channel operands; channel N is AC_EXP_OUT0 + N. */
#define AC_EXP_OUT0             2
3116
/* Classification of one export channel operand. */
enum ac_ir_type {
        AC_IR_UNDEF, /* operand is an LLVM undef */
        AC_IR_CONST, /* operand is a floating-point constant */
        AC_IR_VALUE, /* anything else */
};
3122
/* One parsed channel of a PARAM export. */
struct ac_vs_exp_chan
{
        /* The channel operand as found in the export instruction. */
        LLVMValueRef value;
        /* Constant value of the operand; only set for AC_IR_CONST. */
        float const_float;
        /* How the operand was classified. */
        enum ac_ir_type type;
};
3129
/* One parsed PARAM export instruction. */
struct ac_vs_exp_inst {
        /* Export target relative to V_008DFC_SQ_EXP_PARAM. */
        unsigned offset;
        /* The export call instruction itself. */
        LLVMValueRef inst;
        /* The four channel operands of the export. */
        struct ac_vs_exp_chan chan[4];
};
3135
/* List of PARAM exports that were kept (not eliminated) so far. */
struct ac_vs_exports {
        /* Number of valid entries in exp[]. */
        unsigned num;
        struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};
3140
3141 /* Return true if the PARAM export has been eliminated. */
3142 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
3143                                       uint32_t num_outputs,
3144                                       struct ac_vs_exp_inst *exp)
3145 {
3146         unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
3147         bool is_zero[4] = {}, is_one[4] = {};
3148
3149         for (i = 0; i < 4; i++) {
3150                 /* It's a constant expression. Undef outputs are eliminated too. */
3151                 if (exp->chan[i].type == AC_IR_UNDEF) {
3152                         is_zero[i] = true;
3153                         is_one[i] = true;
3154                 } else if (exp->chan[i].type == AC_IR_CONST) {
3155                         if (exp->chan[i].const_float == 0)
3156                                 is_zero[i] = true;
3157                         else if (exp->chan[i].const_float == 1)
3158                                 is_one[i] = true;
3159                         else
3160                                 return false; /* other constant */
3161                 } else
3162                         return false;
3163         }
3164
3165         /* Only certain combinations of 0 and 1 can be eliminated. */
3166         if (is_zero[0] && is_zero[1] && is_zero[2])
3167                 default_val = is_zero[3] ? 0 : 1;
3168         else if (is_one[0] && is_one[1] && is_one[2])
3169                 default_val = is_zero[3] ? 2 : 3;
3170         else
3171                 return false;
3172
3173         /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
3174         LLVMInstructionEraseFromParent(exp->inst);
3175
3176         /* Change OFFSET to DEFAULT_VAL. */
3177         for (i = 0; i < num_outputs; i++) {
3178                 if (vs_output_param_offset[i] == exp->offset) {
3179                         vs_output_param_offset[i] =
3180                                 AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
3181                         break;
3182                 }
3183         }
3184         return true;
3185 }
3186
3187 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
3188                                            uint8_t *vs_output_param_offset,
3189                                            uint32_t num_outputs,
3190                                            struct ac_vs_exports *processed,
3191                                            struct ac_vs_exp_inst *exp)
3192 {
3193         unsigned p, copy_back_channels = 0;
3194
3195         /* See if the output is already in the list of processed outputs.
3196          * The LLVMValueRef comparison relies on SSA.
3197          */
3198         for (p = 0; p < processed->num; p++) {
3199                 bool different = false;
3200
3201                 for (unsigned j = 0; j < 4; j++) {
3202                         struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
3203                         struct ac_vs_exp_chan *c2 = &exp->chan[j];
3204
3205                         /* Treat undef as a match. */
3206                         if (c2->type == AC_IR_UNDEF)
3207                                 continue;
3208
3209                         /* If c1 is undef but c2 isn't, we can copy c2 to c1
3210                          * and consider the instruction duplicated.
3211                          */
3212                         if (c1->type == AC_IR_UNDEF) {
3213                                 copy_back_channels |= 1 << j;
3214                                 continue;
3215                         }
3216
3217                         /* Test whether the channels are not equal. */
3218                         if (c1->type != c2->type ||
3219                             (c1->type == AC_IR_CONST &&
3220                              c1->const_float != c2->const_float) ||
3221                             (c1->type == AC_IR_VALUE &&
3222                              c1->value != c2->value)) {
3223                                 different = true;
3224                                 break;
3225                         }
3226                 }
3227                 if (!different)
3228                         break;
3229
3230                 copy_back_channels = 0;
3231         }
3232         if (p == processed->num)
3233                 return false;
3234
3235         /* If a match was found, but the matching export has undef where the new
3236          * one has a normal value, copy the normal value to the undef channel.
3237          */
3238         struct ac_vs_exp_inst *match = &processed->exp[p];
3239
3240         /* Get current enabled channels mask. */
3241         LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
3242         unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
3243
3244         while (copy_back_channels) {
3245                 unsigned chan = u_bit_scan(&copy_back_channels);
3246
3247                 assert(match->chan[chan].type == AC_IR_UNDEF);
3248                 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
3249                                exp->chan[chan].value);
3250                 match->chan[chan] = exp->chan[chan];
3251
3252                 /* Update number of enabled channels because the original mask
3253                  * is not always 0xf.
3254                  */
3255                 enabled_channels |= (1 << chan);
3256                 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
3257                                LLVMConstInt(ctx->i32, enabled_channels, 0));
3258         }
3259
3260         /* The PARAM export is duplicated. Kill it. */
3261         LLVMInstructionEraseFromParent(exp->inst);
3262
3263         /* Change OFFSET to the matching export. */
3264         for (unsigned i = 0; i < num_outputs; i++) {
3265                 if (vs_output_param_offset[i] == exp->offset) {
3266                         vs_output_param_offset[i] = match->offset;
3267                         break;
3268                 }
3269         }
3270         return true;
3271 }
3272
3273 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
3274                             LLVMValueRef main_fn,
3275                             uint8_t *vs_output_param_offset,
3276                             uint32_t num_outputs,
3277                             uint8_t *num_param_exports)
3278 {
3279         LLVMBasicBlockRef bb;
3280         bool removed_any = false;
3281         struct ac_vs_exports exports;
3282
3283         exports.num = 0;
3284
3285         /* Process all LLVM instructions. */
3286         bb = LLVMGetFirstBasicBlock(main_fn);
3287         while (bb) {
3288                 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
3289
3290                 while (inst) {
3291                         LLVMValueRef cur = inst;
3292                         inst = LLVMGetNextInstruction(inst);
3293                         struct ac_vs_exp_inst exp;
3294
3295                         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
3296                                 continue;
3297
3298                         LLVMValueRef callee = ac_llvm_get_called_value(cur);
3299
3300                         if (!ac_llvm_is_function(callee))
3301                                 continue;
3302
3303                         const char *name = LLVMGetValueName(callee);
3304                         unsigned num_args = LLVMCountParams(callee);
3305
3306                         /* Check if this is an export instruction. */
3307                         if ((num_args != 9 && num_args != 8) ||
3308                             (strcmp(name, "llvm.SI.export") &&
3309                              strcmp(name, "llvm.amdgcn.exp.f32")))
3310                                 continue;
3311
3312                         LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
3313                         unsigned target = LLVMConstIntGetZExtValue(arg);
3314
3315                         if (target < V_008DFC_SQ_EXP_PARAM)
3316                                 continue;
3317
3318                         target -= V_008DFC_SQ_EXP_PARAM;
3319
3320                         /* Parse the instruction. */
3321                         memset(&exp, 0, sizeof(exp));
3322                         exp.offset = target;
3323                         exp.inst = cur;
3324
3325                         for (unsigned i = 0; i < 4; i++) {
3326                                 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
3327
3328                                 exp.chan[i].value = v;
3329
3330                                 if (LLVMIsUndef(v)) {
3331                                         exp.chan[i].type = AC_IR_UNDEF;
3332                                 } else if (LLVMIsAConstantFP(v)) {
3333                                         LLVMBool loses_info;
3334                                         exp.chan[i].type = AC_IR_CONST;
3335                                         exp.chan[i].const_float =
3336                                                 LLVMConstRealGetDouble(v, &loses_info);
3337                                 } else {
3338                                         exp.chan[i].type = AC_IR_VALUE;
3339                                 }
3340                         }
3341
3342                         /* Eliminate constant and duplicated PARAM exports. */
3343                         if (ac_eliminate_const_output(vs_output_param_offset,
3344                                                       num_outputs, &exp) ||
3345                             ac_eliminate_duplicated_output(ctx,
3346                                                            vs_output_param_offset,
3347                                                            num_outputs, &exports,
3348                                                            &exp)) {
3349                                 removed_any = true;
3350                         } else {
3351                                 exports.exp[exports.num++] = exp;
3352                         }
3353                 }
3354                 bb = LLVMGetNextBasicBlock(bb);
3355         }
3356
3357         /* Remove holes in export memory due to removed PARAM exports.
3358          * This is done by renumbering all PARAM exports.
3359          */
3360         if (removed_any) {
3361                 uint8_t old_offset[VARYING_SLOT_MAX];
3362                 unsigned out, i;
3363
3364                 /* Make a copy of the offsets. We need the old version while
3365                  * we are modifying some of them. */
3366                 memcpy(old_offset, vs_output_param_offset,
3367                        sizeof(old_offset));
3368
3369                 for (i = 0; i < exports.num; i++) {
3370                         unsigned offset = exports.exp[i].offset;
3371
3372                         /* Update vs_output_param_offset. Multiple outputs can
3373                          * have the same offset.
3374                          */
3375                         for (out = 0; out < num_outputs; out++) {
3376                                 if (old_offset[out] == offset)
3377                                         vs_output_param_offset[out] = i;
3378                         }
3379
3380                         /* Change the PARAM offset in the instruction. */
3381                         LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
3382                                        LLVMConstInt(ctx->i32,
3383                                                     V_008DFC_SQ_EXP_PARAM + i, 0));
3384                 }
3385                 *num_param_exports = exports.num;
3386         }
3387 }
3388
3389 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
3390 {
3391         LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
3392         ac_build_intrinsic(ctx,
3393                            "llvm.amdgcn.init.exec", ctx->voidt,
3394                            &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
3395 }
3396
3397 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
3398 {
3399         unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
3400         ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
3401                                      LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
3402                                      "lds");
3403 }
3404
3405 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
3406                          LLVMValueRef dw_addr)
3407 {
3408         return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
3409 }
3410
3411 void ac_lds_store(struct ac_llvm_context *ctx,
3412                   LLVMValueRef dw_addr,
3413                   LLVMValueRef value)
3414 {
3415         value = ac_to_integer(ctx, value);
3416         ac_build_indexed_store(ctx, ctx->lds,
3417                                dw_addr, value);
3418 }
3419
3420 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
3421                          LLVMTypeRef dst_type,
3422                          LLVMValueRef src0)
3423 {
3424         unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3425         const char *intrin_name;
3426         LLVMTypeRef type;
3427         LLVMValueRef zero;
3428
3429         switch (src0_bitsize) {
3430         case 64:
3431                 intrin_name = "llvm.cttz.i64";
3432                 type = ctx->i64;
3433                 zero = ctx->i64_0;
3434                 break;
3435         case 32:
3436                 intrin_name = "llvm.cttz.i32";
3437                 type = ctx->i32;
3438                 zero = ctx->i32_0;
3439                 break;
3440         case 16:
3441                 intrin_name = "llvm.cttz.i16";
3442                 type = ctx->i16;
3443                 zero = ctx->i16_0;
3444                 break;
3445         case 8:
3446                 intrin_name = "llvm.cttz.i8";
3447                 type = ctx->i8;
3448                 zero = ctx->i8_0;
3449                 break;
3450         default:
3451                 unreachable(!"invalid bitsize");
3452         }
3453
3454         LLVMValueRef params[2] = {
3455                 src0,
3456
3457                 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
3458                  * add special code to check for x=0. The reason is that
3459                  * the LLVM behavior for x=0 is different from what we
3460                  * need here. However, LLVM also assumes that ffs(x) is
3461                  * in [0, 31], but GLSL expects that ffs(0) = -1, so
3462                  * a conditional assignment to handle 0 is still required.
3463                  *
3464                  * The hardware already implements the correct behavior.
3465                  */
3466                 ctx->i1true,
3467         };
3468
3469         LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
3470                                               params, 2,
3471                                               AC_FUNC_ATTR_READNONE);
3472
3473         if (src0_bitsize == 64) {
3474                 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
3475         } else if (src0_bitsize < 32) {
3476                 lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
3477         }
3478
3479         /* TODO: We need an intrinsic to skip this conditional. */
3480         /* Check for zero: */
3481         return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
3482                                                            LLVMIntEQ, src0,
3483                                                            zero, ""),
3484                                LLVMConstInt(ctx->i32, -1, 0), lsb, "");
3485 }
3486
3487 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
3488 {
3489         return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
3490 }
3491
3492 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
3493 {
3494         return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
3495 }
3496
3497 static struct ac_llvm_flow *
3498 get_current_flow(struct ac_llvm_context *ctx)
3499 {
3500         if (ctx->flow->depth > 0)
3501                 return &ctx->flow->stack[ctx->flow->depth - 1];
3502         return NULL;
3503 }
3504
3505 static struct ac_llvm_flow *
3506 get_innermost_loop(struct ac_llvm_context *ctx)
3507 {
3508         for (unsigned i = ctx->flow->depth; i > 0; --i) {
3509                 if (ctx->flow->stack[i - 1].loop_entry_block)
3510                         return &ctx->flow->stack[i - 1];
3511         }
3512         return NULL;
3513 }
3514
3515 static struct ac_llvm_flow *
3516 push_flow(struct ac_llvm_context *ctx)
3517 {
3518         struct ac_llvm_flow *flow;
3519
3520         if (ctx->flow->depth >= ctx->flow->depth_max) {
3521                 unsigned new_max = MAX2(ctx->flow->depth << 1,
3522                                         AC_LLVM_INITIAL_CF_DEPTH);
3523
3524                 ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
3525                 ctx->flow->depth_max = new_max;
3526         }
3527
3528         flow = &ctx->flow->stack[ctx->flow->depth];
3529         ctx->flow->depth++;
3530
3531         flow->next_block = NULL;
3532         flow->loop_entry_block = NULL;
3533         return flow;
3534 }
3535
3536 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
3537                                 int label_id)
3538 {
3539         char buf[32];
3540         snprintf(buf, sizeof(buf), "%s%d", base, label_id);
3541         LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
3542 }
3543
3544 /* Append a basic block at the level of the parent flow.
3545  */
3546 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
3547                                             const char *name)
3548 {
3549         assert(ctx->flow->depth >= 1);
3550
3551         if (ctx->flow->depth >= 2) {
3552                 struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
3553
3554                 return LLVMInsertBasicBlockInContext(ctx->context,
3555                                                      flow->next_block, name);
3556         }
3557
3558         LLVMValueRef main_fn =
3559                 LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3560         return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3561 }
3562
3563 /* Emit a branch to the given default target for the current block if
3564  * applicable -- that is, if the current block does not already contain a
3565  * branch from a break or continue.
3566  */
3567 static void emit_default_branch(LLVMBuilderRef builder,
3568                                 LLVMBasicBlockRef target)
3569 {
3570         if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3571                  LLVMBuildBr(builder, target);
3572 }
3573
3574 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3575 {
3576         struct ac_llvm_flow *flow = push_flow(ctx);
3577         flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3578         flow->next_block = append_basic_block(ctx, "ENDLOOP");
3579         set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3580         LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3581         LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3582 }
3583
3584 void ac_build_break(struct ac_llvm_context *ctx)
3585 {
3586         struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3587         LLVMBuildBr(ctx->builder, flow->next_block);
3588 }
3589
3590 void ac_build_continue(struct ac_llvm_context *ctx)
3591 {
3592         struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3593         LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3594 }
3595
3596 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3597 {
3598         struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3599         LLVMBasicBlockRef endif_block;
3600
3601         assert(!current_branch->loop_entry_block);
3602
3603         endif_block = append_basic_block(ctx, "ENDIF");
3604         emit_default_branch(ctx->builder, endif_block);
3605
3606         LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3607         set_basicblock_name(current_branch->next_block, "else", label_id);
3608
3609         current_branch->next_block = endif_block;
3610 }
3611
3612 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3613 {
3614         struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3615
3616         assert(!current_branch->loop_entry_block);
3617
3618         emit_default_branch(ctx->builder, current_branch->next_block);
3619         LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3620         set_basicblock_name(current_branch->next_block, "endif", label_id);
3621
3622         ctx->flow->depth--;
3623 }
3624
3625 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3626 {
3627         struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3628
3629         assert(current_loop->loop_entry_block);
3630
3631         emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3632
3633         LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3634         set_basicblock_name(current_loop->next_block, "endloop", label_id);
3635         ctx->flow->depth--;
3636 }
3637
3638 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3639 {
3640         struct ac_llvm_flow *flow = push_flow(ctx);
3641         LLVMBasicBlockRef if_block;
3642
3643         if_block = append_basic_block(ctx, "IF");
3644         flow->next_block = append_basic_block(ctx, "ELSE");
3645         set_basicblock_name(if_block, "if", label_id);
3646         LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3647         LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3648 }
3649
3650 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
3651                  int label_id)
3652 {
3653         LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
3654                                           value, ctx->f32_0, "");
3655         ac_build_ifcc(ctx, cond, label_id);
3656 }
3657
3658 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
3659                   int label_id)
3660 {
3661         LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3662                                           ac_to_integer(ctx, value),
3663                                           ctx->i32_0, "");
3664         ac_build_ifcc(ctx, cond, label_id);
3665 }
3666
3667 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
3668                              const char *name)
3669 {
3670         LLVMBuilderRef builder = ac->builder;
3671         LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3672         LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3673         LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3674         LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3675         LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3676         LLVMValueRef res;
3677
3678         if (first_instr) {
3679                 LLVMPositionBuilderBefore(first_builder, first_instr);
3680         } else {
3681                 LLVMPositionBuilderAtEnd(first_builder, first_block);
3682         }
3683
3684         res = LLVMBuildAlloca(first_builder, type, name);
3685         LLVMDisposeBuilder(first_builder);
3686         return res;
3687 }
3688
3689 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
3690                                    LLVMTypeRef type, const char *name)
3691 {
3692         LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3693         LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3694         return ptr;
3695 }
3696
3697 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
3698                          LLVMTypeRef type)
3699 {
3700         int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3701         return LLVMBuildBitCast(ctx->builder, ptr,
3702                                 LLVMPointerType(type, addr_space), "");
3703 }
3704
3705 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
3706                             unsigned count)
3707 {
3708         unsigned num_components = ac_get_llvm_num_components(value);
3709         if (count == num_components)
3710                 return value;
3711
3712         LLVMValueRef masks[MAX2(count, 2)];
3713         masks[0] = ctx->i32_0;
3714         masks[1] = ctx->i32_1;
3715         for (unsigned i = 2; i < count; i++)
3716                 masks[i] = LLVMConstInt(ctx->i32, i, false);
3717
3718         if (count == 1)
3719                 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
3720                                                "");
3721
3722         LLVMValueRef swizzle = LLVMConstVector(masks, count);
3723         return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3724 }
3725
3726 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
3727                              unsigned rshift, unsigned bitwidth)
3728 {
3729         LLVMValueRef value = param;
3730         if (rshift)
3731                 value = LLVMBuildLShr(ctx->builder, value,
3732                                       LLVMConstInt(ctx->i32, rshift, false), "");
3733
3734         if (rshift + bitwidth < 32) {
3735                 unsigned mask = (1 << bitwidth) - 1;
3736                 value = LLVMBuildAnd(ctx->builder, value,
3737                                      LLVMConstInt(ctx->i32, mask, false), "");
3738         }
3739         return value;
3740 }
3741
3742 /* Adjust the sample index according to FMASK.
3743  *
3744  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3745  * which is the identity mapping. Each nibble says which physical sample
3746  * should be fetched to get that sample.
3747  *
3748  * For example, 0x11111100 means there are only 2 samples stored and
3749  * the second sample covers 3/4 of the pixel. When reading samples 0
3750  * and 1, return physical sample 0 (determined by the first two 0s
3751  * in FMASK), otherwise return physical sample 1.
3752  *
3753  * The sample index should be adjusted as follows:
3754  *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3755  */
3756 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
3757                               LLVMValueRef *addr, bool is_array_tex)
3758 {
3759         struct ac_image_args fmask_load = {};
3760         fmask_load.opcode = ac_image_load;
3761         fmask_load.resource = fmask;
3762         fmask_load.dmask = 0xf;
3763         fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3764         fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3765
3766         fmask_load.coords[0] = addr[0];
3767         fmask_load.coords[1] = addr[1];
3768         if (is_array_tex)
3769                 fmask_load.coords[2] = addr[2];
3770
3771         LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3772         fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
3773                                               ac->i32_0, "");
3774
3775         /* Apply the formula. */
3776         unsigned sample_chan = is_array_tex ? 3 : 2;
3777         LLVMValueRef final_sample;
3778         final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3779                                     LLVMConstInt(ac->i32, 4, 0), "");
3780         final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3781         /* Mask the sample index by 0x7, because 0x8 means an unknown value
3782          * with EQAA, so those will map to 0. */
3783         final_sample = LLVMBuildAnd(ac->builder, final_sample,
3784                                     LLVMConstInt(ac->i32, 0x7, 0), "");
3785
3786         /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3787          * resource descriptor is 0 (invalid).
3788          */
3789         LLVMValueRef tmp;
3790         tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3791         tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3792         tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3793
3794         /* Replace the MSAA sample index. */
3795         addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
3796                                             addr[sample_chan], "");
3797 }
3798
3799 static LLVMValueRef
3800 _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3801 {
3802         ac_build_optimization_barrier(ctx, &src);
3803         return ac_build_intrinsic(ctx,
3804                         lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3805                         LLVMTypeOf(src), (LLVMValueRef []) {
3806                         src, lane },
3807                         lane == NULL ? 1 : 2,
3808                         AC_FUNC_ATTR_READNONE |
3809                         AC_FUNC_ATTR_CONVERGENT);
3810 }
3811
3812 /**
3813  * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3814  * @param ctx
3815  * @param src
3816  * @param lane - id of the lane or NULL for the first active lane
3817  * @return value of the lane
3818  */
3819 LLVMValueRef
3820 ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3821 {
3822         LLVMTypeRef src_type = LLVMTypeOf(src);
3823         src = ac_to_integer(ctx, src);
3824         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3825         LLVMValueRef ret;
3826
3827         if (bits == 32) {
3828                 ret = _ac_build_readlane(ctx, src, lane);
3829         } else {
3830                 assert(bits % 32 == 0);
3831                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3832                 LLVMValueRef src_vector =
3833                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3834                 ret = LLVMGetUndef(vec_type);
3835                 for (unsigned i = 0; i < bits / 32; i++) {
3836                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3837                                                 LLVMConstInt(ctx->i32, i, 0), "");
3838                         LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
3839                         ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
3840                                                 LLVMConstInt(ctx->i32, i, 0), "");
3841                 }
3842         }
3843         if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
3844                 return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
3845         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3846 }
3847
3848 LLVMValueRef
3849 ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
3850 {
3851         if (HAVE_LLVM >= 0x0800) {
3852                 return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
3853                                           (LLVMValueRef []) {value, lane, src}, 3,
3854                                           AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3855         }
3856
3857         LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
3858                                           ac_get_thread_id(ctx), "");
3859         return LLVMBuildSelect(ctx->builder, pred, value, src, "");
3860 }
3861
3862 LLVMValueRef
3863 ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3864 {
3865         if (ctx->wave_size == 32) {
3866                 return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3867                                           (LLVMValueRef []) { mask, ctx->i32_0 },
3868                                           2, AC_FUNC_ATTR_READNONE);
3869         }
3870         LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
3871                                                  LLVMVectorType(ctx->i32, 2),
3872                                                  "");
3873         LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
3874                                                        ctx->i32_0, "");
3875         LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
3876                                                        ctx->i32_1, "");
3877         LLVMValueRef val =
3878                 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3879                                    (LLVMValueRef []) { mask_lo, ctx->i32_0 },
3880                                    2, AC_FUNC_ATTR_READNONE);
3881         val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
3882                                  (LLVMValueRef []) { mask_hi, val },
3883                                  2, AC_FUNC_ATTR_READNONE);
3884         return val;
3885 }
3886
/* Values for the dpp_ctrl operand of the DPP intrinsic.
 *
 * Enumerators with a leading underscore are base encodings that get a
 * small operand ORed into their low bits; the remaining ones are complete
 * encodings.
 */
enum dpp_ctrl {
        /* Base encodings. */
        _dpp_quad_perm = 0x000,
        _dpp_row_sl = 0x100,
        _dpp_row_sr = 0x110,
        _dpp_row_rr = 0x120,

        /* 0x130 group, spaced by 4. */
        dpp_wf_sl1 = 0x130,
        dpp_wf_rl1 = 0x134,
        dpp_wf_sr1 = 0x138,
        dpp_wf_rr1 = 0x13C,

        /* Consecutive encodings 0x140 .. 0x143. */
        dpp_row_mirror = 0x140,
        dpp_row_half_mirror,    /* 0x141 */
        dpp_row_bcast15,        /* 0x142 */
        dpp_row_bcast31         /* 0x143 */
};
3901
3902 static inline enum dpp_ctrl
3903 dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
3904 {
3905         assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3906         return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3907 }
3908
3909 static inline enum dpp_ctrl
3910 dpp_row_sl(unsigned amount)
3911 {
3912         assert(amount > 0 && amount < 16);
3913         return _dpp_row_sl | amount;
3914 }
3915
3916 static inline enum dpp_ctrl
3917 dpp_row_sr(unsigned amount)
3918 {
3919         assert(amount > 0 && amount < 16);
3920         return _dpp_row_sr | amount;
3921 }
3922
3923 static LLVMValueRef
3924 _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3925               enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3926               bool bound_ctrl)
3927 {
3928         return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
3929                                         LLVMTypeOf(old),
3930                                         (LLVMValueRef[]) {
3931                                                 old, src,
3932                                                 LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3933                                                 LLVMConstInt(ctx->i32, row_mask, 0),
3934                                                 LLVMConstInt(ctx->i32, bank_mask, 0),
3935                                                 LLVMConstInt(ctx->i1, bound_ctrl, 0) },
3936                                         6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3937 }
3938
3939 static LLVMValueRef
3940 ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3941              enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3942              bool bound_ctrl)
3943 {
3944         LLVMTypeRef src_type = LLVMTypeOf(src);
3945         src = ac_to_integer(ctx, src);
3946         old = ac_to_integer(ctx, old);
3947         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3948         LLVMValueRef ret;
3949         if (bits == 32) {
3950                 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
3951                                     bank_mask, bound_ctrl);
3952         } else {
3953                 assert(bits % 32 == 0);
3954                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3955                 LLVMValueRef src_vector =
3956                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3957                 LLVMValueRef old_vector =
3958                         LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3959                 ret = LLVMGetUndef(vec_type);
3960                 for (unsigned i = 0; i < bits / 32; i++) {
3961                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3962                                                       LLVMConstInt(ctx->i32, i,
3963                                                                    0), "");
3964                         old = LLVMBuildExtractElement(ctx->builder, old_vector,
3965                                                       LLVMConstInt(ctx->i32, i,
3966                                                                    0), "");
3967                         LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
3968                                                               dpp_ctrl,
3969                                                               row_mask,
3970                                                               bank_mask,
3971                                                               bound_ctrl);
3972                         ret = LLVMBuildInsertElement(ctx->builder, ret,
3973                                                      ret_comp,
3974                                                      LLVMConstInt(ctx->i32, i,
3975                                                                   0), "");
3976                 }
3977         }
3978         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3979 }
3980
3981 static LLVMValueRef
3982 _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3983                      bool exchange_rows, bool bound_ctrl)
3984 {
3985         LLVMValueRef args[6] = {
3986                 src,
3987                 src,
3988                 LLVMConstInt(ctx->i32, sel, false),
3989                 LLVMConstInt(ctx->i32, sel >> 32, false),
3990                 ctx->i1true, /* fi */
3991                 bound_ctrl ? ctx->i1true : ctx->i1false,
3992         };
3993         return ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16"
3994                                                      : "llvm.amdgcn.permlane16",
3995                                   ctx->i32, args, 6,
3996                                   AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3997 }
3998
3999 static LLVMValueRef
4000 ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
4001                     bool exchange_rows, bool bound_ctrl)
4002 {
4003         LLVMTypeRef src_type = LLVMTypeOf(src);
4004         src = ac_to_integer(ctx, src);
4005         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
4006         LLVMValueRef ret;
4007         if (bits == 32) {
4008                 ret = _ac_build_permlane16(ctx, src, sel, exchange_rows,
4009                                            bound_ctrl);
4010         } else {
4011                 assert(bits % 32 == 0);
4012                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
4013                 LLVMValueRef src_vector =
4014                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
4015                 ret = LLVMGetUndef(vec_type);
4016                 for (unsigned i = 0; i < bits / 32; i++) {
4017                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
4018                                                       LLVMConstInt(ctx->i32, i,
4019                                                                    0), "");
4020                         LLVMValueRef ret_comp =
4021                                 _ac_build_permlane16(ctx, src, sel,
4022                                                      exchange_rows,
4023                                                      bound_ctrl);
4024                         ret = LLVMBuildInsertElement(ctx->builder, ret,
4025                                                      ret_comp,
4026                                                      LLVMConstInt(ctx->i32, i,
4027                                                                   0), "");
4028                 }
4029         }
4030         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4031 }
4032
/* Pack three 5-bit masks into one pattern value:
 * and_mask in bits [4:0], or_mask in bits [9:5], xor_mask in bits [14:10].
 * Each mask must be below 32.
 */
static inline unsigned
ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
        assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

        unsigned pattern = xor_mask << 10;

        pattern |= or_mask << 5;
        pattern |= and_mask;

        return pattern;
}
4039
4040 static LLVMValueRef
4041 _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
4042 {
4043         return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
4044                                    LLVMTypeOf(src), (LLVMValueRef []) {
4045                                         src, LLVMConstInt(ctx->i32, mask, 0) },
4046                                    2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
4047 }
4048
4049 LLVMValueRef
4050 ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
4051 {
4052         LLVMTypeRef src_type = LLVMTypeOf(src);
4053         src = ac_to_integer(ctx, src);
4054         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
4055         LLVMValueRef ret;
4056         if (bits == 32) {
4057                 ret = _ac_build_ds_swizzle(ctx, src, mask);
4058         } else {
4059                 assert(bits % 32 == 0);
4060                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
4061                 LLVMValueRef src_vector =
4062                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
4063                 ret = LLVMGetUndef(vec_type);
4064                 for (unsigned i = 0; i < bits / 32; i++) {
4065                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
4066                                                       LLVMConstInt(ctx->i32, i,
4067                                                                    0), "");
4068                         LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
4069                                                                      mask);
4070                         ret = LLVMBuildInsertElement(ctx->builder, ret,
4071                                                      ret_comp,
4072                                                      LLVMConstInt(ctx->i32, i,
4073                                                                   0), "");
4074                 }
4075         }
4076         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4077 }
4078
4079 static LLVMValueRef
4080 ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
4081 {
4082         char name[32], type[8];
4083         ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
4084         snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
4085         return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
4086                                   (LLVMValueRef []) { src }, 1,
4087                                   AC_FUNC_ATTR_READNONE);
4088 }
4089
4090 static LLVMValueRef
4091 ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
4092                       LLVMValueRef inactive)
4093 {
4094         char name[33], type[8];
4095         LLVMTypeRef src_type = LLVMTypeOf(src);
4096         src = ac_to_integer(ctx, src);
4097         inactive = ac_to_integer(ctx, inactive);
4098         ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
4099         snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
4100         LLVMValueRef ret =
4101                 ac_build_intrinsic(ctx, name,
4102                                         LLVMTypeOf(src), (LLVMValueRef []) {
4103                                         src, inactive }, 2,
4104                                         AC_FUNC_ATTR_READNONE |
4105                                         AC_FUNC_ATTR_CONVERGENT);
4106         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4107 }
4108
4109 static LLVMValueRef
4110 get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
4111 {
4112         if (type_size == 4) {
4113                 switch (op) {
4114                 case nir_op_iadd: return ctx->i32_0;
4115                 case nir_op_fadd: return ctx->f32_0;
4116                 case nir_op_imul: return ctx->i32_1;
4117                 case nir_op_fmul: return ctx->f32_1;
4118                 case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
4119                 case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
4120                 case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
4121                 case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
4122                 case nir_op_umax: return ctx->i32_0;
4123                 case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
4124                 case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
4125                 case nir_op_ior: return ctx->i32_0;
4126                 case nir_op_ixor: return ctx->i32_0;
4127                 default:
4128                         unreachable("bad reduction intrinsic");
4129                 }
4130         } else { /* type_size == 64bit */
4131                 switch (op) {
4132                 case nir_op_iadd: return ctx->i64_0;
4133                 case nir_op_fadd: return ctx->f64_0;
4134                 case nir_op_imul: return ctx->i64_1;
4135                 case nir_op_fmul: return ctx->f64_1;
4136                 case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
4137                 case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
4138                 case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
4139                 case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
4140                 case nir_op_umax: return ctx->i64_0;
4141                 case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
4142                 case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
4143                 case nir_op_ior: return ctx->i64_0;
4144                 case nir_op_ixor: return ctx->i64_0;
4145                 default:
4146                         unreachable("bad reduction intrinsic");
4147                 }
4148         }
4149 }
4150
4151 static LLVMValueRef
4152 ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
4153 {
4154         bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
4155         switch (op) {
4156         case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
4157         case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
4158         case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
4159         case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
4160         case nir_op_imin: return LLVMBuildSelect(ctx->builder,
4161                                         LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
4162                                         lhs, rhs, "");
4163         case nir_op_umin: return LLVMBuildSelect(ctx->builder,
4164                                         LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
4165                                         lhs, rhs, "");
4166         case nir_op_fmin: return ac_build_intrinsic(ctx,
4167                                         _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
4168                                         _64bit ? ctx->f64 : ctx->f32,
4169                                         (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
4170         case nir_op_imax: return LLVMBuildSelect(ctx->builder,
4171                                         LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
4172                                         lhs, rhs, "");
4173         case nir_op_umax: return LLVMBuildSelect(ctx->builder,
4174                                         LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
4175                                         lhs, rhs, "");
4176         case nir_op_fmax: return ac_build_intrinsic(ctx,
4177                                         _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
4178                                         _64bit ? ctx->f64 : ctx->f32,
4179                                         (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
4180         case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
4181         case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
4182         case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
4183         default:
4184                 unreachable("bad reduction intrinsic");
4185         }
4186 }
4187
4188 /**
4189  * \param maxprefix specifies that the result only needs to be correct for a
4190  *     prefix of this many threads
4191  *
4192  * TODO: add inclusive and excluse scan functions for GFX6.
4193  */
4194 static LLVMValueRef
4195 ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
4196               unsigned maxprefix, bool inclusive)
4197 {
4198         LLVMValueRef result, tmp;
4199
4200         if (ctx->chip_class >= GFX10) {
4201                 result = inclusive ? src : identity;
4202         } else {
4203                 if (inclusive)
4204                         result = src;
4205                 else
4206                         result = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
4207         }
4208         if (maxprefix <= 1)
4209                 return result;
4210         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
4211         result = ac_build_alu_op(ctx, result, tmp, op);
4212         if (maxprefix <= 2)
4213                 return result;
4214         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
4215         result = ac_build_alu_op(ctx, result, tmp, op);
4216         if (maxprefix <= 3)
4217                 return result;
4218         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
4219         result = ac_build_alu_op(ctx, result, tmp, op);
4220         if (maxprefix <= 4)
4221                 return result;
4222         tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
4223         result = ac_build_alu_op(ctx, result, tmp, op);
4224         if (maxprefix <= 8)
4225                 return result;
4226         tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
4227         result = ac_build_alu_op(ctx, result, tmp, op);
4228         if (maxprefix <= 16)
4229                 return result;
4230
4231         if (ctx->chip_class >= GFX10) {
4232                 /* dpp_row_bcast{15,31} are not supported on gfx10. */
4233                 LLVMBuilderRef builder = ctx->builder;
4234                 LLVMValueRef tid = ac_get_thread_id(ctx);
4235                 LLVMValueRef cc;
4236                 /* TODO-GFX10: Can we get better code-gen by putting this into
4237                  * a branch so that LLVM generates EXEC mask manipulations? */
4238                 if (inclusive)
4239                         tmp = result;
4240                 else
4241                         tmp = ac_build_alu_op(ctx, result, src, op);
4242                 tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
4243                 tmp = ac_build_alu_op(ctx, result, tmp, op);
4244                 cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
4245                 cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
4246                 result = LLVMBuildSelect(builder, cc, tmp, result, "");
4247                 if (maxprefix <= 32)
4248                         return result;
4249
4250                 if (inclusive)
4251                         tmp = result;
4252                 else
4253                         tmp = ac_build_alu_op(ctx, result, src, op);
4254                 tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
4255                 tmp = ac_build_alu_op(ctx, result, tmp, op);
4256                 cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
4257                                    LLVMConstInt(ctx->i32, 32, false), "");
4258                 result = LLVMBuildSelect(builder, cc, tmp, result, "");
4259                 return result;
4260         }
4261
4262         tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4263         result = ac_build_alu_op(ctx, result, tmp, op);
4264         if (maxprefix <= 32)
4265                 return result;
4266         tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4267         result = ac_build_alu_op(ctx, result, tmp, op);
4268         return result;
4269 }
4270
4271 LLVMValueRef
4272 ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4273 {
4274         LLVMValueRef result;
4275
4276         if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4277                 LLVMBuilderRef builder = ctx->builder;
4278                 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4279                 result = ac_build_ballot(ctx, src);
4280                 result = ac_build_mbcnt(ctx, result);
4281                 result = LLVMBuildAdd(builder, result, src, "");
4282                 return result;
4283         }
4284
4285         ac_build_optimization_barrier(ctx, &src);
4286
4287         LLVMValueRef identity =
4288                 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4289         result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4290                                   LLVMTypeOf(identity), "");
4291         result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
4292
4293         return ac_build_wwm(ctx, result);
4294 }
4295
4296 LLVMValueRef
4297 ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4298 {
4299         LLVMValueRef result;
4300
4301         if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4302                 LLVMBuilderRef builder = ctx->builder;
4303                 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4304                 result = ac_build_ballot(ctx, src);
4305                 result = ac_build_mbcnt(ctx, result);
4306                 return result;
4307         }
4308
4309         ac_build_optimization_barrier(ctx, &src);
4310
4311         LLVMValueRef identity =
4312                 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4313         result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4314                                   LLVMTypeOf(identity), "");
4315         result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
4316
4317         return ac_build_wwm(ctx, result);
4318 }
4319
4320 LLVMValueRef
4321 ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
4322 {
4323         if (cluster_size == 1) return src;
4324         ac_build_optimization_barrier(ctx, &src);
4325         LLVMValueRef result, swap;
4326         LLVMValueRef identity = get_reduction_identity(ctx, op,
4327                                                                 ac_get_type_size(LLVMTypeOf(src)));
4328         result = LLVMBuildBitCast(ctx->builder,
4329                                                                 ac_build_set_inactive(ctx, src, identity),
4330                                                                 LLVMTypeOf(identity), "");
4331         swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
4332         result = ac_build_alu_op(ctx, result, swap, op);
4333         if (cluster_size == 2) return ac_build_wwm(ctx, result);
4334
4335         swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
4336         result = ac_build_alu_op(ctx, result, swap, op);
4337         if (cluster_size == 4) return ac_build_wwm(ctx, result);
4338
4339         if (ctx->chip_class >= GFX8)
4340                 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
4341         else
4342                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
4343         result = ac_build_alu_op(ctx, result, swap, op);
4344         if (cluster_size == 8) return ac_build_wwm(ctx, result);
4345
4346         if (ctx->chip_class >= GFX8)
4347                 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
4348         else
4349                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
4350         result = ac_build_alu_op(ctx, result, swap, op);
4351         if (cluster_size == 16) return ac_build_wwm(ctx, result);
4352
4353         if (ctx->chip_class >= GFX10)
4354                 swap = ac_build_permlane16(ctx, result, 0, true, false);
4355         else if (ctx->chip_class >= GFX8 && cluster_size != 32)
4356                 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4357         else
4358                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
4359         result = ac_build_alu_op(ctx, result, swap, op);
4360         if (cluster_size == 32) return ac_build_wwm(ctx, result);
4361
4362         if (ctx->chip_class >= GFX8) {
4363                 if (ctx->chip_class >= GFX10)
4364                         swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
4365                 else
4366                         swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4367                 result = ac_build_alu_op(ctx, result, swap, op);
4368                 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
4369                 return ac_build_wwm(ctx, result);
4370         } else {
4371                 swap = ac_build_readlane(ctx, result, ctx->i32_0);
4372                 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
4373                 result = ac_build_alu_op(ctx, result, swap, op);
4374                 return ac_build_wwm(ctx, result);
4375         }
4376 }
4377
4378 /**
4379  * "Top half" of a scan that reduces per-wave values across an entire
4380  * workgroup.
4381  *
4382  * The source value must be present in the highest lane of the wave, and the
4383  * highest lane must be live.
4384  */
4385 void
4386 ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4387 {
4388         if (ws->maxwaves <= 1)
4389                 return;
4390
4391         const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
4392         LLVMBuilderRef builder = ctx->builder;
4393         LLVMValueRef tid = ac_get_thread_id(ctx);
4394         LLVMValueRef tmp;
4395
4396         tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
4397         ac_build_ifcc(ctx, tmp, 1000);
4398         LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4399         ac_build_endif(ctx, 1000);
4400 }
4401
4402 /**
4403  * "Bottom half" of a scan that reduces per-wave values across an entire
4404  * workgroup.
4405  *
4406  * The caller must place a barrier between the top and bottom halves.
4407  */
4408 void
4409 ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4410 {
4411         const LLVMTypeRef type = LLVMTypeOf(ws->src);
4412         const LLVMValueRef identity =
4413                 get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
4414
4415         if (ws->maxwaves <= 1) {
4416                 ws->result_reduce = ws->src;
4417                 ws->result_inclusive = ws->src;
4418                 ws->result_exclusive = identity;
4419                 return;
4420         }
4421         assert(ws->maxwaves <= 32);
4422
4423         LLVMBuilderRef builder = ctx->builder;
4424         LLVMValueRef tid = ac_get_thread_id(ctx);
4425         LLVMBasicBlockRef bbs[2];
4426         LLVMValueRef phivalues_scan[2];
4427         LLVMValueRef tmp, tmp2;
4428
4429         bbs[0] = LLVMGetInsertBlock(builder);
4430         phivalues_scan[0] = LLVMGetUndef(type);
4431
4432         if (ws->enable_reduce)
4433                 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4434         else if (ws->enable_inclusive)
4435                 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4436         else
4437                 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4438         ac_build_ifcc(ctx, tmp, 1001);
4439         {
4440                 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
4441
4442                 ac_build_optimization_barrier(ctx, &tmp);
4443
4444                 bbs[1] = LLVMGetInsertBlock(builder);
4445                 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
4446         }
4447         ac_build_endif(ctx, 1001);
4448
4449         const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4450
4451         if (ws->enable_reduce) {
4452                 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4453                 ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4454         }
4455         if (ws->enable_inclusive)
4456                 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4457         if (ws->enable_exclusive) {
4458                 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4459                 tmp = ac_build_readlane(ctx, scan, tmp);
4460                 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4461                 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4462         }
4463 }
4464
/**
 * Scan of a per-wave value across an entire workgroup: the top half, an
 * s_barrier, then the bottom half.
 *
 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
 * of the workgroup are live. (This requirement cannot easily be relaxed in a
 * useful manner because of the barrier in the algorithm.)
 */
void
ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
        /* Publish this wave's value. */
        ac_build_wg_wavescan_top(ctx, ws);

        /* The barrier separates the two halves, as the bottom half requires. */
        ac_build_s_barrier(ctx);

        /* Read back the published values and compute the results. */
        ac_build_wg_wavescan_bottom(ctx, ws);
}
4481
4482 /**
4483  * "Top half" of a scan that reduces per-thread values across an entire
4484  * workgroup.
4485  *
4486  * All lanes must be active when this code runs.
4487  */
4488 void
4489 ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4490 {
4491         if (ws->enable_exclusive) {
4492                 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4493                 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4494                         ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4495                 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4496         } else {
4497                 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4498         }
4499
4500         bool enable_inclusive = ws->enable_inclusive;
4501         bool enable_exclusive = ws->enable_exclusive;
4502         ws->enable_inclusive = false;
4503         ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4504         ac_build_wg_wavescan_top(ctx, ws);
4505         ws->enable_inclusive = enable_inclusive;
4506         ws->enable_exclusive = enable_exclusive;
4507 }
4508
4509 /**
4510  * "Bottom half" of a scan that reduces per-thread values across an entire
4511  * workgroup.
4512  *
4513  * The caller must place a barrier between the top and bottom halves.
4514  */
4515 void
4516 ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4517 {
4518         bool enable_inclusive = ws->enable_inclusive;
4519         bool enable_exclusive = ws->enable_exclusive;
4520         ws->enable_inclusive = false;
4521         ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4522         ac_build_wg_wavescan_bottom(ctx, ws);
4523         ws->enable_inclusive = enable_inclusive;
4524         ws->enable_exclusive = enable_exclusive;
4525
4526         /* ws->result_reduce is already the correct value */
4527         if (ws->enable_inclusive)
4528                 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4529         if (ws->enable_exclusive)
4530                 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4531 }
4532
/**
 * A scan that reduces per-thread values across an entire workgroup: the top
 * half, an s_barrier, then the bottom half.
 *
 * The caller must ensure that all lanes are active when this code runs
 * (WWM is insufficient!), because there is an implied barrier.
 */
void
ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
        /* Scan the input and publish this wave's value. */
        ac_build_wg_scan_top(ctx, ws);

        /* The barrier separates the two halves, as the bottom half requires. */
        ac_build_s_barrier(ctx);

        /* Combine the per-wave results with the values kept by the top half. */
        ac_build_wg_scan_bottom(ctx, ws);
}
4546
4547 LLVMValueRef
4548 ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
4549                 unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
4550 {
4551         unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4552         if (ctx->chip_class >= GFX8) {
4553                 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4554         } else {
4555                 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4556         }
4557 }
4558
4559 LLVMValueRef
4560 ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4561 {
4562         index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4563         return ac_build_intrinsic(ctx,
4564                   "llvm.amdgcn.ds.bpermute", ctx->i32,
4565                   (LLVMValueRef []) {index, src}, 2,
4566                   AC_FUNC_ATTR_READNONE |
4567                   AC_FUNC_ATTR_CONVERGENT);
4568 }
4569
4570 LLVMValueRef
4571 ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0,
4572                    unsigned bitsize)
4573 {
4574         LLVMTypeRef type;
4575         char *intr;
4576
4577         if (bitsize == 16) {
4578                 intr = "llvm.amdgcn.frexp.exp.i16.f16";
4579                 type = ctx->i16;
4580         } else if (bitsize == 32) {
4581                 intr = "llvm.amdgcn.frexp.exp.i32.f32";
4582                 type = ctx->i32;
4583         } else {
4584                 intr = "llvm.amdgcn.frexp.exp.i32.f64";
4585                 type = ctx->i32;
4586         }
4587
4588         LLVMValueRef params[] = {
4589                 src0,
4590         };
4591         return ac_build_intrinsic(ctx, intr, type, params, 1,
4592                                   AC_FUNC_ATTR_READNONE);
4593 }
4594 LLVMValueRef
4595 ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0,
4596                     unsigned bitsize)
4597 {
4598         LLVMTypeRef type;
4599         char *intr;
4600
4601         if (bitsize == 16) {
4602                 intr = "llvm.amdgcn.frexp.mant.f16";
4603                 type = ctx->f16;
4604         } else if (bitsize == 32) {
4605                 intr = "llvm.amdgcn.frexp.mant.f32";
4606                 type = ctx->f32;
4607         } else {
4608                 intr = "llvm.amdgcn.frexp.mant.f64";
4609                 type = ctx->f64;
4610         }
4611
4612         LLVMValueRef params[] = {
4613                 src0,
4614         };
4615         return ac_build_intrinsic(ctx, intr, type, params, 1,
4616                                   AC_FUNC_ATTR_READNONE);
4617 }
4618
4619 /*
4620  * this takes an I,J coordinate pair,
4621  * and works out the X and Y derivatives.
4622  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4623  */
4624 LLVMValueRef
4625 ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4626 {
4627         LLVMValueRef result[4], a;
4628         unsigned i;
4629
4630         for (i = 0; i < 2; i++) {
4631                 a = LLVMBuildExtractElement(ctx->builder, interp_ij,
4632                                             LLVMConstInt(ctx->i32, i, false), "");
4633                 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4634                 result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4635         }
4636         return ac_build_gather_values(ctx, result, 4);
4637 }
4638
4639 LLVMValueRef
4640 ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4641 {
4642         LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live",
4643                                                  ctx->i1, NULL, 0,
4644                                                  AC_FUNC_ATTR_READNONE);
4645         result = LLVMBuildNot(ctx->builder, result, "");
4646         return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
4647 }
4648
4649 LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func,
4650                            LLVMValueRef *args, unsigned num_args)
4651 {
4652         LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
4653         LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
4654         return ret;
4655 }