amd/llvm: switch to 3-spaces style
[mesa.git] / src / amd / llvm / ac_llvm_build.c
/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 */
/* based on pieces from si_pipe.c and radeon_llvm_emit.c */
#include "ac_llvm_build.h"

#include "ac_exp_param.h"
#include "ac_llvm_util.h"
#include "ac_shader_util.h"
#include "c11/threads.h"
#include "shader_enums.h"
#include "sid.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_math.h"
#include <llvm-c/Core.h>
#include <llvm/Config/llvm-config.h>

#include <assert.h>
#include <stdio.h>

#define AC_LLVM_INITIAL_CF_DEPTH 4

/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
   /* Loop exit or next part of if/else/endif. */
   LLVMBasicBlockRef next_block;
   LLVMBasicBlockRef loop_entry_block;
};

/* Initialize the context, including ctx::module and ctx::builder, which are
 * created here from the given compiler and float mode.
 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          enum chip_class chip_class, enum radeon_family family,
                          enum ac_float_mode float_mode, unsigned wave_size,
                          unsigned ballot_mask_bits)
{
   ctx->context = LLVMContextCreate();

   ctx->chip_class = chip_class;
   ctx->family = family;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->module =
      ac_create_module(wave_size == 32 ? compiler->tm_wave32 : compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);

   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);

   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
   ctx->flow = calloc(1, sizeof(*ctx->flow));
}

void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
{
   free(ctx->flow->stack);
   free(ctx->flow);
   ctx->flow = NULL;
}

int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   unsigned num_components =
      LLVMGetTypeKind(type) == LLVMVectorTypeKind ? LLVMGetVectorSize(type) : 1;
   return num_components;
}

LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
      assert(index == 0);
      return value;
   }

   return LLVMBuildExtractElement(ac->builder, value, LLVMConstInt(ac->i32, index, false), "");
}

int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      type = LLVMGetElementType(type);

   if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type);

   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
         return 32;
   }

   if (type == ctx->f16)
      return 16;
   if (type == ctx->f32)
      return 32;
   if (type == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}

unsigned ac_get_type_size(LLVMTypeRef type)
{
   LLVMTypeKind kind = LLVMGetTypeKind(type);

   switch (kind) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(type) / 8;
   case LLVMHalfTypeKind:
      return 2;
   case LLVMFloatTypeKind:
      return 4;
   case LLVMDoubleTypeKind:
      return 8;
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
         return 4;
      return 8;
   case LLVMVectorTypeKind:
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   case LLVMArrayTypeKind:
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));
   default:
      assert(0);
      return 0;
   }
}

static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   else if (t == ctx->f16 || t == ctx->i16)
      return ctx->i16;
   else if (t == ctx->f32 || t == ctx->i32)
      return ctx->i32;
   else if (t == ctx->f64 || t == ctx->i64)
      return ctx->i64;
   else
      unreachable("Unhandled integer size");
}

LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
      LLVMTypeRef elem_type = LLVMGetElementType(t);
      return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
   }
   if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
      switch (LLVMGetPointerAddressSpace(t)) {
      case AC_ADDR_SPACE_GLOBAL:
         return ctx->i64;
      case AC_ADDR_SPACE_CONST_32BIT:
      case AC_ADDR_SPACE_LDS:
         return ctx->i32;
      default:
         unreachable("unhandled address space");
      }
   }
   return to_integer_type_scalar(ctx, t);
}

LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
      return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
   }
   return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
}

LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
      return v;
   return ac_to_integer(ctx, v);
}

static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   else if (t == ctx->i16 || t == ctx->f16)
      return ctx->f16;
   else if (t == ctx->i32 || t == ctx->f32)
      return ctx->f32;
   else if (t == ctx->i64 || t == ctx->f64)
      return ctx->f64;
   else
      unreachable("Unhandled float size");
}

LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
      LLVMTypeRef elem_type = LLVMGetElementType(t);
      return LLVMVectorType(to_float_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
   }
   return to_float_type_scalar(ctx, t);
}

LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
}

LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params,
                                unsigned param_count, unsigned attrib_mask)
{
   LLVMValueRef function, call;
   bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);

   function = LLVMGetNamedFunction(ctx->module, name);
   if (!function) {
      LLVMTypeRef param_types[32], function_type;
      unsigned i;

      assert(param_count <= 32);

      for (i = 0; i < param_count; ++i) {
         assert(params[i]);
         param_types[i] = LLVMTypeOf(params[i]);
      }
      function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);

      if (!set_callsite_attrs)
         ac_add_func_attributes(ctx->context, function, attrib_mask);
   }

   call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
   if (set_callsite_attrs)
      ac_add_func_attributes(ctx->context, call, attrib_mask);
   return call;
}

/**
 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 * intrinsic names).
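 *
 * E.g. a <4 x i32> vector yields "v4i32" and a plain f32 yields "f32"
 * (f16/f64 element types are handled the same way below).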
 */
void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
{
   LLVMTypeRef elem_type = type;

   assert(bufsize >= 8);

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
      int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
      if (ret < 0) {
         char *type_name = LLVMPrintTypeToString(type);
         fprintf(stderr, "Error building type name for: %s\n", type_name);
         LLVMDisposeMessage(type_name);
         return;
      }
      elem_type = LLVMGetElementType(type);
      buf += ret;
      bufsize -= ret;
   }
   switch (LLVMGetTypeKind(elem_type)) {
   default:
      break;
   case LLVMIntegerTypeKind:
      snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
      break;
   case LLVMHalfTypeKind:
      snprintf(buf, bufsize, "f16");
      break;
   case LLVMFloatTypeKind:
      snprintf(buf, bufsize, "f32");
      break;
   case LLVMDoubleTypeKind:
      snprintf(buf, bufsize, "f64");
      break;
   }
}

/**
 * Helper function that builds an LLVM IR PHI node and immediately adds
 * incoming edges.
 */
LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
                          LLVMValueRef *values, LLVMBasicBlockRef *blocks)
{
   LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
   LLVMAddIncoming(phi, values, blocks, count_incoming);
   return phi;
}

void ac_build_s_barrier(struct ac_llvm_context *ctx)
{
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
}

/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
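 *
 * For reference, the no-value form emits IR roughly like (a sketch):
 *
 *    call void asm sideeffect "; 42", ""()
 *
 * where "; 42" is just a unique counter value used as the comment payload.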
 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pvgpr)
{
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];

   snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

   if (!pvgpr) {
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall(builder, inlineasm, NULL, 0, "");
   } else {
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
      LLVMTypeRef type = LLVMTypeOf(*pvgpr);
      unsigned bitsize = ac_get_elem_bits(ctx, type);
      LLVMValueRef vgpr = *pvgpr;
      LLVMTypeRef vgpr_type;
      unsigned vgpr_size;
      LLVMValueRef vgpr0;

      if (bitsize < 32)
         vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");

      vgpr_type = LLVMTypeOf(vgpr);
      vgpr_size = ac_get_type_size(vgpr_type);

      assert(vgpr_size % 4 == 0);

      vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
      vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
      vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
      vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
      vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

      if (bitsize < 32)
         vgpr = LLVMBuildTrunc(builder, vgpr, type, "");

      *pvgpr = vgpr;
   }
}

LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
{
   const char *name =
      scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : "llvm.amdgcn.s.memtime";
   LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
   return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
}

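/* Return a bitmask (one bit per lane, in the wave mask type) of the lanes in
 * which \p value is non-zero, i.e. a ballot. Built on the llvm.amdgcn.icmp
 * intrinsic, comparing the value against 0 with the "not equal" predicate.
 */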
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   if (LLVM_VERSION_MAJOR >= 9) {
      if (ctx->wave_size == 64)
         name = "llvm.amdgcn.icmp.i64.i32";
      else
         name = "llvm.amdgcn.icmp.i32.i32";
   } else {
      name = "llvm.amdgcn.icmp.i32";
   }
   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0]);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}

LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   if (LLVM_VERSION_MAJOR >= 9) {
      if (ctx->wave_size == 64)
         name = "llvm.amdgcn.icmp.i64.i1";
      else
         name = "llvm.amdgcn.icmp.i32.i1";
   } else {
      name = "llvm.amdgcn.icmp.i1";
   }
   LLVMValueRef args[3] = {
      value,
      ctx->i1false,
      LLVMConstInt(ctx->i32, LLVMIntNE, 0),
   };

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}

LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
}

LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
   return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0),
                        "");
}

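/* Return true if all active lanes agree on \p value: the vote set must either
 * cover the whole active set or be empty.
 */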
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);

   LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
   LLVMValueRef none =
      LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
   return LLVMBuildOr(ctx->builder, all, none, "");
}

LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   LLVMValueRef vec = NULL;

   if (value_count == 1) {
      return values[component];
   } else if (!value_count)
      unreachable("value_count is 0");

   for (unsigned i = component; i < value_count + component; i++) {
      LLVMValueRef value = values[i];

      if (i == component)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
      LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
   }
   return vec;
}

LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride,
                                             bool load, bool always_vector)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef vec = NULL;
   unsigned i;

   if (value_count == 1 && !always_vector) {
      if (load)
         return LLVMBuildLoad(builder, values[0], "");
      return values[0];
   } else if (!value_count)
      unreachable("value_count is 0");

   for (i = 0; i < value_count; i++) {
      LLVMValueRef value = values[i * value_stride];
      if (load)
         value = LLVMBuildLoad(builder, value, "");

      if (!i)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
      LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(builder, vec, value, index, "");
   }
   return vec;
}

LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
}

/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 * channels with undef. Extract at most src_channels components from the input.
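 *
 * E.g. expanding a v2f32 with src_channels = 2 and dst_channels = 4 yields
 * <x, y, undef, undef>.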
 */
static LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
                                    unsigned src_channels, unsigned dst_channels)
{
   LLVMTypeRef elemtype;
   LLVMValueRef chan[dst_channels];

   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));

      if (src_channels == dst_channels && vec_size == dst_channels)
         return value;

      src_channels = MIN2(src_channels, vec_size);

      for (unsigned i = 0; i < src_channels; i++)
         chan[i] = ac_llvm_extract_elem(ctx, value, i);

      elemtype = LLVMGetElementType(LLVMTypeOf(value));
   } else {
      if (src_channels) {
         assert(src_channels == 1);
         chan[0] = value;
      }
      elemtype = LLVMTypeOf(value);
   }

   for (unsigned i = src_channels; i < dst_channels; i++)
      chan[i] = LLVMGetUndef(elemtype);

   return ac_build_gather_values(ctx, chan, dst_channels);
}

/* Extract components [start, start + channels) from a vector.
 */
LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
                                   unsigned channels)
{
   LLVMValueRef chan[channels];

   for (unsigned i = 0; i < channels; i++)
      chan[i] = ac_llvm_extract_elem(ctx, value, i + start);

   return ac_build_gather_values(ctx, chan, channels);
}

/* Expand a scalar or vector to <4 x type> by filling the remaining channels
 * with undef. Extract at most num_channels components from the input.
 */
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
                                     unsigned num_channels)
{
   return ac_build_expand(ctx, value, num_channels, 4);
}

LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
   const char *name;

   if (type_size == 2)
      name = "llvm.rint.f16";
   else if (type_size == 4)
      name = "llvm.rint.f32";
   else
      name = "llvm.rint.f64";

   return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   unsigned type_size = ac_get_type_size(LLVMTypeOf(den));
   const char *name;

   /* For doubles, we need precise division to pass GLCTS. */
   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && type_size == 8)
      return LLVMBuildFDiv(ctx->builder, num, den, "");

   if (type_size == 2)
      name = "llvm.amdgcn.rcp.f16";
   else if (type_size == 4)
      name = "llvm.amdgcn.rcp.f32";
   else
      name = "llvm.amdgcn.rcp.f64";

   LLVMValueRef rcp =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);

   return LLVMBuildFMul(ctx->builder, num, rcp, "");
}

/* See fast_idiv_by_const.h. */
/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
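/* In other words, this computes:
 *    (((uint64_t)(num >> pre_shift) * multiplier + increment) >> 32) >> post_shift
 */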
LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
                                LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}

/* See fast_idiv_by_const.h. */
/* If num != UINT_MAX, this more efficient version can be used. */
/* Set: increment = util_fast_udiv_info::increment; */
LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
                                    LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                    LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildNUWAdd(builder, num, increment, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}

/* See fast_idiv_by_const.h. */
/* Both operands must fit in 31 bits and the divisor must not be 1. */
LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
                                              LLVMValueRef multiplier, LLVMValueRef post_shift)
{
   LLVMBuilderRef builder = ctx->builder;

   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}

/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
   LLVMValueRef stc[2];
   LLVMValueRef ma;
   LLVMValueRef id;
};

static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
                                 struct cube_selection_coords *out)
{
   LLVMTypeRef f32 = ctx->f32;

   out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", f32, in, 3, AC_FUNC_ATTR_READNONE);
}

/**
 * Build a manual selection sequence for cube face sc/tc coordinates and
 * major axis vector (multiplied by 2 for consistency) for the given
 * vec3 \p coords, for the face implied by \p selcoords.
 *
 * For the major axis, we always adjust the sign to be in the direction of
 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 * the selcoords major axis.
 */
static void build_cube_select(struct ac_llvm_context *ctx,
                              const struct cube_selection_coords *selcoords,
                              const LLVMValueRef *coords, LLVMValueRef *out_st,
                              LLVMValueRef *out_ma)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
   LLVMValueRef is_ma_positive;
   LLVMValueRef sgn_ma;
   LLVMValueRef is_ma_z, is_not_ma_z;
   LLVMValueRef is_ma_y;
   LLVMValueRef is_ma_x;
   LLVMValueRef sgn;
   LLVMValueRef tmp;

   is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
   sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
                            LLVMConstReal(f32, -1.0), "");

   is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
   is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
   is_ma_y = LLVMBuildAnd(
      builder, is_not_ma_z,
      LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
   is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");

   /* Select sc */
   tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
   sgn = LLVMBuildSelect(
      builder, is_ma_y, LLVMConstReal(f32, 1.0),
      LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
   out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select tc */
   tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
   sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
   out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select ma */
   tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
                         LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
   tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
   *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
}

void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
                            LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
{
   LLVMBuilderRef builder = ctx->builder;
   struct cube_selection_coords selcoords;
   LLVMValueRef coords[3];
   LLVMValueRef invma;

   if (is_array && !is_lod) {
      LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);

      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
       *
       *    "For Array forms, the array layer used will be
       *
       *       max(0, min(d−1, floor(layer+0.5)))
       *
       *    where d is the depth of the texture array and layer
       *    comes from the component indicated in the tables below."
       *
       * The rounding is also a workaround for an issue where the layer is
       * taken from a helper invocation which happens to fall on a different
       * layer due to extrapolation.
       *
       * GFX8 and earlier attempt to implement this in hardware by
       * clamping the value of coords[2] = (8 * layer) + face.
       * Unfortunately, this means that we end up with the wrong
       * face when clamping occurs.
       *
       * Clamp the layer earlier to work around the issue.
       */
      if (ctx->chip_class <= GFX8) {
         LLVMValueRef ge0;
         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
      }

      coords_arg[3] = tmp;
   }

   build_cube_intrinsic(ctx, coords_arg, &selcoords);

   invma =
      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

   coords[2] = selcoords.id;

   if (is_deriv && derivs_arg) {
      LLVMValueRef derivs[4];
      int axis;

      /* Convert cube derivatives to 2D derivatives. */
      for (axis = 0; axis < 2; axis++) {
         LLVMValueRef deriv_st[2];
         LLVMValueRef deriv_ma;

         /* Transform the derivative alongside the texture
          * coordinate. Mathematically, the correct formula is
          * as follows. Assume we're projecting onto the +Z face
          * and denote by dx/dh the derivative of the (original)
          * X texture coordinate with respect to horizontal
          * window coordinates. The projection onto the +Z face
          * plane is:
          *
          *    f(x,z) = x/z
          *
          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
          *
          * This motivates the implementation below.
          *
          * Whether this actually gives the expected results for
          * apps that might feed in derivatives obtained via
          * finite differences is anyone's guess. The OpenGL spec
          * seems awfully quiet about how textureGrad for cube
          * maps should be handled.
          */
         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);

         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

         for (int i = 0; i < 2; ++i)
            derivs[axis * 2 + i] =
               LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
                             LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
      }

      memcpy(derivs_arg, derivs, sizeof(derivs));
   }

   /* Shift the texture coordinate. This must be applied after the
    * derivative calculation.
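    *
    * Since ma is premultiplied by 2, stc * invma lands in [-0.5, 0.5], so
    * the +1.5 below yields coordinates in [1.0, 2.0] (presumably the range
    * in which the hardware samples cube faces).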
    */
   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

   if (is_array) {
      /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
      /* coords_arg.w component - array_index for cube arrays */
      coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
   }

   memcpy(coords_arg, coords, sizeof(coords));
}

LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   LLVMValueRef args[5];
   LLVMValueRef p1;

   args[0] = i;
   args[1] = llvm_chan;
   args[2] = attr_number;
   args[3] = params;

   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);

   args[0] = p1;
   args[1] = j;
   args[2] = llvm_chan;
   args[3] = attr_number;
   args[4] = params;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, args, 5,
                             AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j)
{
   LLVMValueRef args[6];
   LLVMValueRef p1;

   args[0] = i;
   args[1] = llvm_chan;
   args[2] = attr_number;
   args[3] = ctx->i1false;
   args[4] = params;

   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
                           AC_FUNC_ATTR_READNONE);

   args[0] = p1;
   args[1] = j;
   args[2] = llvm_chan;
   args[3] = attr_number;
   args[4] = ctx->i1false;
   args[5] = params;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
                             AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   LLVMValueRef args[4];

   args[0] = parameter;
   args[1] = llvm_chan;
   args[2] = attr_number;
   args[3] = params;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4,
                             AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
}

LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   LLVMValueRef indices[2] = {
      ctx->i32_0,
      index,
   };
   return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
}

LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
{
   return LLVMBuildPointerCast(ctx->builder, LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
                               LLVMTypeOf(ptr), "");
}

void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, base_ptr, index));
}

/**
 * Build an LLVM IR indexed load using LLVMBuildGEP + LLVMBuildLoad.
 * It's equivalent to doing a load from &base_ptr[index].
 *
 * \param base_ptr  Where the array starts.
 * \param index     The element index into the array.
 * \param uniform   Whether the base_ptr and index can be assumed to be
 *                  dynamically uniform (i.e. load to an SGPR)
 * \param invariant Whether the load is invariant (no other opcodes affect it)
 * \param no_unsigned_wraparound
 *    For all possible re-associations and re-distributions of an expression
 *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
 *    without inbounds in base_ptr), this parameter is true if "addr + offset"
 *    does not result in an unsigned integer wraparound. This is used for
 *    optimal code generation of 32-bit pointer arithmetic.
 *
 *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
 *    integer wraparound can't be an imm offset in s_load_dword, because
 *    the instruction performs "addr + offset" in 64 bits.
 *
 * Expected usage for bindless textures by chaining GEPs:
 *    // possible unsigned wraparound, don't use InBounds:
 *    ptr1 = LLVMBuildGEP(base_ptr, index);
 *    image = load(ptr1); // becomes "s_load ptr1, 0"
 *
 *    ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
 *    sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
 */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                         LLVMValueRef index, bool uniform, bool invariant,
                                         bool no_unsigned_wraparound)
{
   LLVMValueRef pointer, result;

   if (no_unsigned_wraparound &&
       LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
      pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
   else
      pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");

   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
   result = LLVMBuildLoad(ctx->builder, pointer, "");
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   return result;
}

LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
}

LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
}

/* This assumes that there is no unsigned integer wraparound during the address
 * computation, excluding all GEPs within base_ptr. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                   LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
}

/* See ac_build_load_custom() documentation. */
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
                                                   LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
}

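/* On GFX10+, a load that requests device coherence (GLC) also needs the DLC
 * bit, presumably so the additional cache level introduced on GFX10 is
 * bypassed as well.
 */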
static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
{
   return cache_policy | (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
}

static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         unsigned cache_policy, bool use_format, bool structurized)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}

void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true, true);
}

/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned inst_offset, unsigned cache_policy)
{
   /* Split 3-channel stores, because only LLVM 9+ supports 3-channel
    * intrinsics. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
      LLVMValueRef v[3], v01;

      for (int i = 0; i < 3; i++) {
         v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
      }
      v01 = ac_build_gather_values(ctx, v, 2);

      ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, cache_policy);
      ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8,
                                  cache_policy);
      return;
   }

   /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
    * (voffset is swizzled, but soffset isn't swizzled).
    * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
    */
   if (!(cache_policy & ac_swizzled)) {
      LLVMValueRef offset = soffset;

      if (inst_offset)
         offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, inst_offset, 0), "");

      ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset,
                                   cache_policy, false, false);
      return;
   }

   static const unsigned dfmts[] = {V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32_32};
   unsigned dfmt = dfmts[num_channels - 1];
   unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
   LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);

   ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, immoffset, num_channels, dfmt,
                              nfmt, cache_policy);
}

static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, unsigned cache_policy,
                                                bool can_speculate, bool use_format,
                                                bool structurized)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->chip_class >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}

LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned inst_offset, unsigned cache_policy, bool can_speculate,
                                  bool allow_smem)
{
   LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
   if (voffset)
      offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
   if (soffset)
      offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

   if (allow_smem && !(cache_policy & ac_slc) &&
       (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[8];

      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
         };
         result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
      }
      if (num_channels == 1)
         return result[0];

      if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
         result[num_channels++] = LLVMGetUndef(ctx->f32);
      return ac_build_gather_values(ctx, result, num_channels);
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, ctx->i32_0, num_channels,
                                      ctx->f32, cache_policy, can_speculate, false, false);
}

LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, unsigned cache_policy,
                                         bool can_speculate, bool d16)
{
   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
                                      d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
                                      true);
}

static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate,
                                          bool structurized)
{
   voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}

LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate)
{
   return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, immoffset, num_channels, dfmt,
                                nfmt, cache_policy, can_speculate, true);
}

LLVMValueRef ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       LLVMValueRef immoffset, unsigned num_channels,
                                       unsigned dfmt, unsigned nfmt, unsigned cache_policy,
                                       bool can_speculate)
{
   return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset, immoffset, num_channels, dfmt,
                                nfmt, cache_policy, can_speculate, false);
}

LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         LLVMValueRef immoffset, unsigned cache_policy)
{
   LLVMValueRef res;

   if (LLVM_VERSION_MAJOR >= 9) {
      voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

      /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
      res = ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,
                                        cache_policy, false, false, false);
   } else {
      unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
      unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;

      res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1, dfmt, nfmt,
                                      cache_policy, false);

      res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
   }

   return res;
}

LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        LLVMValueRef immoffset, unsigned cache_policy)
{
   LLVMValueRef res;

   if (LLVM_VERSION_MAJOR >= 9) {
      voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

      /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
      res = ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8,
                                        cache_policy, false, false, false);
   } else {
      unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
      unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;

      res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1, dfmt, nfmt,
                                      cache_policy, false);

      res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
   }

   return res;
}

/**
 * Convert an 11- or 10-bit unsigned floating point number to an f32.
 *
 * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
 * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
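 *
 * E.g. for the 11-bit format (exp_bits = 5, mant_bits = 6), the bias is
 * 2^4 - 1 = 15.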
 */
static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
                                    unsigned exp_bits, unsigned mant_bits)
{
   assert(LLVMTypeOf(src) == ctx->i32);

   LLVMValueRef tmp;
   LLVMValueRef mantissa;
   mantissa =
      LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");

   /* Converting normal numbers is just a shift + correcting the exponent bias */
   unsigned normal_shift = 23 - mant_bits;
   unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
   LLVMValueRef shifted, normal;

   shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
   normal =
      LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");

   /* Converting nan/inf numbers is the same, but with a different exponent update */
   LLVMValueRef naninf;
   naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");

   /* Converting denormals is the complex case: determine the leading zeros of the
    * mantissa to obtain the correct shift for the mantissa and exponent correction.
    */
   LLVMValueRef denormal;
   LLVMValueRef params[2] = {
      mantissa, ctx->i1true, /* result can be undef when arg is 0 */
   };
   LLVMValueRef ctlz =
      ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);

   /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
   tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
   denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");

   unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
   tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
   tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
   denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");

   /* Select the final result. */
   LLVMValueRef result;

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, 1 << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");

   return ac_to_float(ctx, result);
}

/**
 * Generate a fully general open coded buffer format fetch with all required
 * fixups suitable for vertex fetch, using non-format buffer loads.
 *
 * Some combinations of argument values have special interpretations:
 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
 *
 * \param log_size      log(size of channel in bytes)
 * \param num_channels  number of channels (1 to 4)
 * \param format        AC_FETCH_FORMAT_xxx value
 * \param reverse       whether XYZ channels are reversed
 * \param known_aligned whether the source is known to be aligned to hardware's
 *                      effective element size for loading the given format
 *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
 * \param rsrc          buffer resource descriptor
 * \return the resulting vector of floats or integers bitcast to <4 x i32>
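 *
 * E.g. an unaligned 16_16 fetch on GFX6 or GFX10+ is performed as four
 * single-byte loads that are then recombined into two shorts below.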
1417 */
1418 LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
1419 unsigned num_channels, unsigned format, bool reverse,
1420 bool known_aligned, LLVMValueRef rsrc,
1421 LLVMValueRef vindex, LLVMValueRef voffset,
1422 LLVMValueRef soffset, unsigned cache_policy,
1423 bool can_speculate)
1424 {
1425 LLVMValueRef tmp;
1426 unsigned load_log_size = log_size;
1427 unsigned load_num_channels = num_channels;
1428 if (log_size == 3) {
1429 load_log_size = 2;
1430 if (format == AC_FETCH_FORMAT_FLOAT) {
1431 load_num_channels = 2 * num_channels;
1432 } else {
1433 load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1434 }
1435 }
1436
1437 int log_recombine = 0;
1438 if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) {
1439 /* Avoid alignment restrictions by loading one byte at a time. */
1440 load_num_channels <<= load_log_size;
1441 log_recombine = load_log_size;
1442 load_log_size = 0;
1443 } else if (load_num_channels == 2 || load_num_channels == 4) {
1444 log_recombine = -util_logbase2(load_num_channels);
1445 load_num_channels = 1;
1446 load_log_size += -log_recombine;
1447 }
1448
1449 assert(load_log_size >= 2 || LLVM_VERSION_MAJOR >= 9);
1450
1451 LLVMValueRef loads[32]; /* up to 32 bytes */
1452 for (unsigned i = 0; i < load_num_channels; ++i) {
1453 tmp =
1454 LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1455 LLVMTypeRef channel_type =
1456 load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
1457 unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1458 loads[i] =
1459 ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
1460 cache_policy, can_speculate, false, true);
1461 if (load_log_size >= 2)
1462 loads[i] = ac_to_integer(ctx, loads[i]);
1463 }
1464
1465 if (log_recombine > 0) {
1466 /* Recombine bytes if necessary (GFX6 only) */
1467 LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1468
1469 for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1470 LLVMValueRef accum = NULL;
1471 for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1472 tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1473 if (i == 0) {
1474 accum = tmp;
1475 } else {
1476 tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
1477 accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1478 }
1479 }
1480 loads[dst] = accum;
1481 }
1482 } else if (log_recombine < 0) {
1483 /* Split vectors of dwords */
1484 if (load_log_size > 2) {
1485 assert(load_num_channels == 1);
1486 LLVMValueRef loaded = loads[0];
1487 unsigned log_split = load_log_size - 2;
1488 log_recombine += log_split;
1489 load_num_channels = 1 << log_split;
1490 load_log_size = 2;
1491 for (unsigned i = 0; i < load_num_channels; ++i) {
1492 tmp = LLVMConstInt(ctx->i32, i, false);
1493 loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1494 }
1495 }
1496
1497 /* Further split dwords and shorts if required */
1498 if (log_recombine < 0) {
1499 for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
1500 --src) {
1501 unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1502 LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1503 LLVMValueRef loaded = loads[src - 1];
1504 LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1505 for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1506 tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1507 tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1508 loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1509 }
1510 }
1511 }
1512 }
1513
1514 if (log_size == 3) {
1515 if (format == AC_FETCH_FORMAT_FLOAT) {
1516 for (unsigned i = 0; i < num_channels; ++i) {
1517 tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1518 loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1519 }
1520 } else if (format == AC_FETCH_FORMAT_FIXED) {
1521 /* 10_11_11_FLOAT, passed in as FIXED with log_size == 3 */
1522 LLVMValueRef data = loads[0];
1523 LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1524 LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1525 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1526 LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1527 LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1528
1529 loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1530 loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1531 loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1532
1533 num_channels = 3;
1534 log_size = 2;
1535 format = AC_FETCH_FORMAT_FLOAT;
1536 } else {
1537 /* 2_10_10_10 data formats */
1538 LLVMValueRef data = loads[0];
1539 LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1540 LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1541 loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1542 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1543 loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1544 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1545 loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1546 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1547 loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1548
1549 num_channels = 4;
1550 }
1551 }
1552
1553 if (format == AC_FETCH_FORMAT_FLOAT) {
1554 if (log_size != 2) {
1555 for (unsigned chan = 0; chan < num_channels; ++chan) {
1556 tmp = ac_to_float(ctx, loads[chan]);
1557 if (log_size == 3)
1558 tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1559 else if (log_size == 1)
1560 tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1561 loads[chan] = ac_to_integer(ctx, tmp);
1562 }
1563 }
1564 } else if (format == AC_FETCH_FORMAT_UINT) {
1565 if (log_size != 2) {
1566 for (unsigned chan = 0; chan < num_channels; ++chan)
1567 loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1568 }
1569 } else if (format == AC_FETCH_FORMAT_SINT) {
1570 if (log_size != 2) {
1571 for (unsigned chan = 0; chan < num_channels; ++chan)
1572 loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1573 }
1574 } else {
1575 bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
1576 format == AC_FETCH_FORMAT_UINT;
1577
1578 for (unsigned chan = 0; chan < num_channels; ++chan) {
1579 if (unsign) {
1580 tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1581 } else {
1582 tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1583 }
1584
1585 LLVMValueRef scale = NULL;
1586 if (format == AC_FETCH_FORMAT_FIXED) {
1587 assert(log_size == 2);
1588 scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1589 } else if (format == AC_FETCH_FORMAT_UNORM) {
1590 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1591 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1592 } else if (format == AC_FETCH_FORMAT_SNORM) {
1593 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1594 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1595 }
1596 if (scale)
1597 tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1598
1599 if (format == AC_FETCH_FORMAT_SNORM) {
1600 /* Clamp to [-1, 1] */
1601 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
1602 LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
1603 tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
1604 }
1605
1606 loads[chan] = ac_to_integer(ctx, tmp);
1607 }
1608 }
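/* Example of the scaling above: 8-bit UNORM uses 1/255, mapping raw 0..255
 * to 0.0..1.0; 8-bit SNORM uses 1/127, so raw -128 scales to about -1.008
 * and the clamp brings it back to -1.0, matching the usual SNORM rule that
 * both -128 and -127 represent -1.0. */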
1609
1610 while (num_channels < 4) {
1611 if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
1612 loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
1613 } else {
1614 loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
1615 }
1616 num_channels++;
1617 }
1618
1619 if (reverse) {
1620 tmp = loads[0];
1621 loads[0] = loads[2];
1622 loads[2] = tmp;
1623 }
1624
1625 return ac_build_gather_values(ctx, loads, 4);
1626 }
1627
1628 static void ac_build_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1629 LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
1630 LLVMValueRef soffset, LLVMValueRef immoffset,
1631 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1632 unsigned cache_policy, bool structurized)
1633 {
1634 voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, immoffset, "");
1635
1636 LLVMValueRef args[7];
1637 int idx = 0;
1638 args[idx++] = vdata;
1639 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1640 if (structurized)
1641 args[idx++] = vindex ? vindex : ctx->i32_0;
1642 args[idx++] = voffset ? voffset : ctx->i32_0;
1643 args[idx++] = soffset ? soffset : ctx->i32_0;
1644 args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
1645 args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
1646 unsigned func =
1647 !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1648 const char *indexing_kind = structurized ? "struct" : "raw";
1649 char name[256], type_name[8];
1650
1651 LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1652 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1653
1654 snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", indexing_kind, type_name);
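/* e.g. a structurized 4-channel store yields
 * "llvm.amdgcn.struct.tbuffer.store.v4i32" and a raw single-channel store
 * "llvm.amdgcn.raw.tbuffer.store.i32"; 3-channel stores fall back to the
 * v4 variant on chips without vec3 intrinsic support. */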
1655
1656 ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1657 }
1658
1659 void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1660 LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
1661 LLVMValueRef soffset, LLVMValueRef immoffset,
1662 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1663 unsigned cache_policy)
1664 {
1665 ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, immoffset, num_channels, dfmt,
1666 nfmt, cache_policy, true);
1667 }
1668
1669 void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1670 LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset,
1671 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1672 unsigned cache_policy)
1673 {
1674 ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, immoffset, num_channels, dfmt,
1675 nfmt, cache_policy, false);
1676 }
1677
1678 void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1679 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
1680 unsigned cache_policy)
1681 {
1682 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
1683
1684 if (LLVM_VERSION_MAJOR >= 9) {
1685 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1686 ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
1687 false);
1688 } else {
1689 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1690 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1691
1692 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
1693
1694 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1, dfmt, nfmt,
1695 cache_policy);
1696 }
1697 }
1698
1699 void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1700 LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
1701 {
1702 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
1703
1704 if (LLVM_VERSION_MAJOR >= 9) {
1705 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1706 ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
1707 false);
1708 } else {
1709 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1710 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1711
1712 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
1713
1714 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1, dfmt, nfmt,
1715 cache_policy);
1716 }
1717 }
1718 /**
1719 * Set range metadata on an instruction. This can only be used on load and
1720 * call instructions. If you know an instruction can only produce the values
1721 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1722 * \p lo is the minimum value inclusive.
1723 * \p hi is the maximum value exclusive.
1724 */
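/* The example above produces IR along these lines (illustrative):
 *    %val = load i32, i32* %p, !range !0
 *    !0 = !{i32 0, i32 3}
 * so LLVM may assume the loaded value lies in the half-open range [0, 3). */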
1725 static void set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1726 unsigned hi)
1727 {
1728 LLVMValueRef range_md, md_args[2];
1729 LLVMTypeRef type = LLVMTypeOf(value);
1730 LLVMContextRef context = LLVMGetTypeContext(type);
1731
1732 md_args[0] = LLVMConstInt(type, lo, false);
1733 md_args[1] = LLVMConstInt(type, hi, false);
1734 range_md = LLVMMDNodeInContext(context, md_args, 2);
1735 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1736 }
1737
1738 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1739 {
1740 LLVMValueRef tid;
1741
1742 LLVMValueRef tid_args[2];
1743 tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
1744 tid_args[1] = ctx->i32_0;
1745 tid_args[1] =
1746 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, tid_args, 2, AC_FUNC_ATTR_READNONE);
1747
1748 if (ctx->wave_size == 32) {
1749 tid = tid_args[1];
1750 } else {
1751 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, tid_args, 2,
1752 AC_FUNC_ATTR_READNONE);
1753 }
1754 set_range_metadata(ctx, tid, 0, ctx->wave_size);
1755 return tid;
1756 }
1757
1758 /*
1759 * AMD GCN implements derivatives using the local data store (LDS).
1760 * All writes to the LDS happen in all executing threads at
1761 * the same time. TID is the Thread ID for the current
1762 * thread and is a value between 0 and 63, representing
1763 * the thread's position in the wavefront.
1764 *
1765 * For the pixel shader, threads are grouped into quads of four pixels.
1766 * The TIDs of the pixels of a quad are:
1767 *
1768 * +------+------+
1769 * |4n + 0|4n + 1|
1770 * +------+------+
1771 * |4n + 2|4n + 3|
1772 * +------+------+
1773 *
1774 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1775 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1776 * the current pixel's column, and masking with 0xfffffffe yields the TID
1777 * of the left pixel of the current pixel's row.
1778 *
1779 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1780 * adding 2 yields the TID of the pixel below the top pixel.
1781 */
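/* Example (how callers are expected to use the mask/idx pair): ddx passes
 * mask = 0xfffffffe and idx = 1, making tl the left pixel of each row and
 * trbl the right one (per-quad lanes tl = {0,0,2,2}, trbl = {1,1,3,3});
 * ddy passes mask = 0xfffffffd and idx = 2, making tl the top pixel of
 * each column and trbl the bottom one. */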
1782 LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
1783 {
1784 unsigned tl_lanes[4], trbl_lanes[4];
1785 char name[32], type[8];
1786 LLVMValueRef tl, trbl;
1787 LLVMTypeRef result_type;
1788 LLVMValueRef result;
1789
1790 result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
1791
1792 if (result_type == ctx->f16)
1793 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
1794 else if (result_type == ctx->v2f16)
1795 val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");
1796
1797 for (unsigned i = 0; i < 4; ++i) {
1798 tl_lanes[i] = i & mask;
1799 trbl_lanes[i] = (i & mask) + idx;
1800 }
1801
1802 tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
1803 trbl =
1804 ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);
1805
1806 if (result_type == ctx->f16) {
1807 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
1808 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
1809 }
1810
1811 tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
1812 trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
1813 result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
1814
1815 ac_build_type_name_for_intr(result_type, type, sizeof(type));
1816 snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
1817
1818 return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
1819 }
1820
1821 void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
1822 {
1823 LLVMValueRef args[2];
1824 args[0] = LLVMConstInt(ctx->i32, msg, false);
1825 args[1] = wave_id;
1826 ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
1827 }
1828
1829 LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
1830 {
1831 LLVMValueRef msb =
1832 ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);
1833
1834 /* The HW returns the bit index counted from the MSB, but NIR/TGSI wants
1835 * it counted from the LSB. Invert it by doing "31 - msb". */
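/* Worked example: for arg = 8 (binary ...01000) the first bit differing
 * from the sign bit sits 28 positions below the MSB, so sffbh returns 28
 * and 31 - 28 = 3 is the LSB-based index. arg = 0 and arg = -1 contain no
 * such bit, hence the special-cased -1 below. */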
1836 msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");
1837
1838 LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
1839 LLVMValueRef cond =
1840 LLVMBuildOr(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, ""),
1841 LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, ""), "");
1842
1843 return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
1844 }
1845
1846 LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
1847 {
1848 const char *intrin_name;
1849 LLVMTypeRef type;
1850 LLVMValueRef highest_bit;
1851 LLVMValueRef zero;
1852 unsigned bitsize;
1853
1854 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
1855 switch (bitsize) {
1856 case 64:
1857 intrin_name = "llvm.ctlz.i64";
1858 type = ctx->i64;
1859 highest_bit = LLVMConstInt(ctx->i64, 63, false);
1860 zero = ctx->i64_0;
1861 break;
1862 case 32:
1863 intrin_name = "llvm.ctlz.i32";
1864 type = ctx->i32;
1865 highest_bit = LLVMConstInt(ctx->i32, 31, false);
1866 zero = ctx->i32_0;
1867 break;
1868 case 16:
1869 intrin_name = "llvm.ctlz.i16";
1870 type = ctx->i16;
1871 highest_bit = LLVMConstInt(ctx->i16, 15, false);
1872 zero = ctx->i16_0;
1873 break;
1874 case 8:
1875 intrin_name = "llvm.ctlz.i8";
1876 type = ctx->i8;
1877 highest_bit = LLVMConstInt(ctx->i8, 7, false);
1878 zero = ctx->i8_0;
1879 break;
1880 default:
1881 unreachable("invalid bitsize");
1882 break;
1883 }
1884
1885 LLVMValueRef params[2] = {
1886 arg,
1887 ctx->i1true,
1888 };
1889
1890 LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);
1891
1892 /* The HW returns the bit index counted from the MSB, but TGSI/NIR wants
1893 * it counted from the LSB. Invert it by doing "31 - msb". */
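/* Worked example: for a 32-bit arg = 0x00f0, ctlz returns 24 and
 * 31 - 24 = 7, the index of the highest set bit. ctlz is called with
 * i1true ("zero is undef"), so arg = 0 relies on the final select below,
 * which returns -1 as GLSL findMSB expects. */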
1894 msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
1895
1896 if (bitsize == 64) {
1897 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
1898 } else if (bitsize < 32) {
1899 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
1900 }
1901
1902 /* check for zero */
1903 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
1904 LLVMConstInt(ctx->i32, -1, true), msb, "");
1905 }
1906
1907 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1908 {
1909 char name[64], type[64];
1910
1911 ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1912 snprintf(name, sizeof(name), "llvm.minnum.%s", type);
1913 LLVMValueRef args[2] = {a, b};
1914 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
1915 }
1916
1917 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1918 {
1919 char name[64], type[64];
1920
1921 ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1922 snprintf(name, sizeof(name), "llvm.maxnum.%s", type);
1923 LLVMValueRef args[2] = {a, b};
1924 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
1925 }
1926
1927 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1928 {
1929 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
1930 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1931 }
1932
1933 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1934 {
1935 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
1936 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1937 }
1938
1939 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1940 {
1941 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
1942 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1943 }
1944
1945 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1946 {
1947 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
1948 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1949 }
1950
1951 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
1952 {
1953 LLVMTypeRef t = LLVMTypeOf(value);
1954 return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
1955 LLVMConstReal(t, 1.0));
1956 }
1957
1958 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1959 {
1960 LLVMValueRef args[9];
1961
1962 args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1963 args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1964
1965 if (a->compr) {
1966 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1967 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1968 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1969 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1970
1971 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1972 } else {
1973 args[2] = a->out[0];
1974 args[3] = a->out[1];
1975 args[4] = a->out[2];
1976 args[5] = a->out[3];
1977 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1978 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1979
1980 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1981 }
1982 }
1983
1984 void ac_build_export_null(struct ac_llvm_context *ctx)
1985 {
1986 struct ac_export_args args;
1987
1988 args.enabled_channels = 0x0; /* enabled channels */
1989 args.valid_mask = 1; /* whether the EXEC mask is valid */
1990 args.done = 1; /* DONE bit */
1991 args.target = V_008DFC_SQ_EXP_NULL;
1992 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
1993 args.out[0] = LLVMGetUndef(ctx->f32); /* R */
1994 args.out[1] = LLVMGetUndef(ctx->f32); /* G */
1995 args.out[2] = LLVMGetUndef(ctx->f32); /* B */
1996 args.out[3] = LLVMGetUndef(ctx->f32); /* A */
1997
1998 ac_build_export(ctx, &args);
1999 }
2000
2001 static unsigned ac_num_coords(enum ac_image_dim dim)
2002 {
2003 switch (dim) {
2004 case ac_image_1d:
2005 return 1;
2006 case ac_image_2d:
2007 case ac_image_1darray:
2008 return 2;
2009 case ac_image_3d:
2010 case ac_image_cube:
2011 case ac_image_2darray:
2012 case ac_image_2dmsaa:
2013 return 3;
2014 case ac_image_2darraymsaa:
2015 return 4;
2016 default:
2017 unreachable("ac_num_coords: bad dim");
2018 }
2019 }
2020
2021 static unsigned ac_num_derivs(enum ac_image_dim dim)
2022 {
2023 switch (dim) {
2024 case ac_image_1d:
2025 case ac_image_1darray:
2026 return 2;
2027 case ac_image_2d:
2028 case ac_image_2darray:
2029 case ac_image_cube:
2030 return 4;
2031 case ac_image_3d:
2032 return 6;
2033 case ac_image_2dmsaa:
2034 case ac_image_2darraymsaa:
2035 default:
2036 unreachable("derivatives not supported");
2037 }
2038 }
2039
2040 static const char *get_atomic_name(enum ac_atomic_op op)
2041 {
2042 switch (op) {
2043 case ac_atomic_swap:
2044 return "swap";
2045 case ac_atomic_add:
2046 return "add";
2047 case ac_atomic_sub:
2048 return "sub";
2049 case ac_atomic_smin:
2050 return "smin";
2051 case ac_atomic_umin:
2052 return "umin";
2053 case ac_atomic_smax:
2054 return "smax";
2055 case ac_atomic_umax:
2056 return "umax";
2057 case ac_atomic_and:
2058 return "and";
2059 case ac_atomic_or:
2060 return "or";
2061 case ac_atomic_xor:
2062 return "xor";
2063 case ac_atomic_inc_wrap:
2064 return "inc";
2065 case ac_atomic_dec_wrap:
2066 return "dec";
2067 }
2068 unreachable("bad atomic op");
2069 }
2070
2071 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
2072 {
2073 const char *overload[3] = {"", "", ""};
2074 unsigned num_overloads = 0;
2075 LLVMValueRef args[18];
2076 unsigned num_args = 0;
2077 enum ac_image_dim dim = a->dim;
2078
2079 assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
2080 assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
2081 a->opcode != ac_image_store_mip) ||
2082 a->lod);
2083 assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2084 (!a->compare && !a->offset));
2085 assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2086 a->opcode == ac_image_get_lod) ||
2087 !a->bias);
2088 assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
2089 1);
2090 assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
2091 assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic &&
2092 a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
2093 a->opcode != ac_image_get_resinfo));
2094
2095 if (a->opcode == ac_image_get_lod) {
2096 switch (dim) {
2097 case ac_image_1darray:
2098 dim = ac_image_1d;
2099 break;
2100 case ac_image_2darray:
2101 case ac_image_cube:
2102 dim = ac_image_2d;
2103 break;
2104 default:
2105 break;
2106 }
2107 }
2108
2109 bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2110 a->opcode == ac_image_get_lod;
2111 bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
2112 bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2113 a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
2114 LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
2115
2116 if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2117 args[num_args++] = a->data[0];
2118 if (a->opcode == ac_image_atomic_cmpswap)
2119 args[num_args++] = a->data[1];
2120 }
2121
2122 if (!atomic)
2123 args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
2124
2125 if (a->offset)
2126 args[num_args++] = ac_to_integer(ctx, a->offset);
2127 if (a->bias) {
2128 args[num_args++] = ac_to_float(ctx, a->bias);
2129 overload[num_overloads++] = ".f32";
2130 }
2131 if (a->compare)
2132 args[num_args++] = ac_to_float(ctx, a->compare);
2133 if (a->derivs[0]) {
2134 unsigned count = ac_num_derivs(dim);
2135 for (unsigned i = 0; i < count; ++i)
2136 args[num_args++] = ac_to_float(ctx, a->derivs[i]);
2137 overload[num_overloads++] = ".f32";
2138 }
2139 unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
2140 for (unsigned i = 0; i < num_coords; ++i)
2141 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
2142 if (a->lod)
2143 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
2144 if (a->min_lod)
2145 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");
2146
2147 overload[num_overloads++] = sample ? ".f32" : ".i32";
2148
2149 args[num_args++] = a->resource;
2150 if (sample) {
2151 args[num_args++] = a->sampler;
2152 args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
2153 }
2154
2155 args[num_args++] = ctx->i32_0; /* texfailctrl */
2156 args[num_args++] = LLVMConstInt(
2157 ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);
2158
2159 const char *name;
2160 const char *atomic_subop = "";
2161 switch (a->opcode) {
2162 case ac_image_sample:
2163 name = "sample";
2164 break;
2165 case ac_image_gather4:
2166 name = "gather4";
2167 break;
2168 case ac_image_load:
2169 name = "load";
2170 break;
2171 case ac_image_load_mip:
2172 name = "load.mip";
2173 break;
2174 case ac_image_store:
2175 name = "store";
2176 break;
2177 case ac_image_store_mip:
2178 name = "store.mip";
2179 break;
2180 case ac_image_atomic:
2181 name = "atomic.";
2182 atomic_subop = get_atomic_name(a->atomic);
2183 break;
2184 case ac_image_atomic_cmpswap:
2185 name = "atomic.";
2186 atomic_subop = "cmpswap";
2187 break;
2188 case ac_image_get_lod:
2189 name = "getlod";
2190 break;
2191 case ac_image_get_resinfo:
2192 name = "getresinfo";
2193 break;
2194 default:
2195 unreachable("invalid image opcode");
2196 }
2197
2198 const char *dimname;
2199 switch (dim) {
2200 case ac_image_1d:
2201 dimname = "1d";
2202 break;
2203 case ac_image_2d:
2204 dimname = "2d";
2205 break;
2206 case ac_image_3d:
2207 dimname = "3d";
2208 break;
2209 case ac_image_cube:
2210 dimname = "cube";
2211 break;
2212 case ac_image_1darray:
2213 dimname = "1darray";
2214 break;
2215 case ac_image_2darray:
2216 dimname = "2darray";
2217 break;
2218 case ac_image_2dmsaa:
2219 dimname = "2dmsaa";
2220 break;
2221 case ac_image_2darraymsaa:
2222 dimname = "2darraymsaa";
2223 break;
2224 default:
2225 unreachable("invalid dim");
2226 }
2227
2228 bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
2229 char intr_name[96];
2230 snprintf(intr_name, sizeof(intr_name),
2231 "llvm.amdgcn.image.%s%s" /* base name */
2232 "%s%s%s%s" /* sample/gather modifiers */
2233 ".%s.%s%s%s%s", /* dimension and type overloads */
2234 name, atomic_subop, a->compare ? ".c" : "",
2235 a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
2236 a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
2237 atomic ? "i32" : (a->d16 ? "v4f16" : "v4f32"), overload[0], overload[1], overload[2]);
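/* e.g. a 2D sample with compare, explicit LOD and offset produces
 * "llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f32". */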
2238
2239 LLVMTypeRef retty;
2240 if (atomic)
2241 retty = ctx->i32;
2242 else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
2243 retty = ctx->voidt;
2244 else
2245 retty = a->d16 ? ctx->v4f16 : ctx->v4f32;
2246
2247 LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
2248 if (!sample && !atomic && retty != ctx->voidt)
2249 result = ac_to_integer(ctx, result);
2250
2251 return result;
2252 }
2253
2254 LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
2255 {
2256 LLVMValueRef samples;
2257
2258 /* Read the samples from the descriptor directly.
2259 * Hardware doesn't have any instruction for this.
2260 */
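/* For MSAA resources, bits [19:16] of dword 3 (the LAST_LEVEL field) hold
 * log2(samples), so e.g. a field value of 3 yields 1 << 3 = 8 samples. */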
2261 samples = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
2262 samples = LLVMBuildLShr(ctx->builder, samples, LLVMConstInt(ctx->i32, 16, 0), "");
2263 samples = LLVMBuildAnd(ctx->builder, samples, LLVMConstInt(ctx->i32, 0xf, 0), "");
2264 samples = LLVMBuildShl(ctx->builder, ctx->i32_1, samples, "");
2265 return samples;
2266 }
2267
2268 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2269 {
2270 return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2,
2271 AC_FUNC_ATTR_READNONE);
2272 }
2273
2274 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2275 {
2276 LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2,
2277 AC_FUNC_ATTR_READNONE);
2278 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2279 }
2280
2281 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2282 {
2283 LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2,
2284 AC_FUNC_ATTR_READNONE);
2285 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2286 }
2287
2288 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2289 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2290 bool hi)
2291 {
2292 assert(bits == 8 || bits == 10 || bits == 16);
2293
2294 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2295 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2296 LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2297 LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2298
2299 /* Clamp. */
2300 if (bits != 16) {
2301 for (int i = 0; i < 2; i++) {
2302 bool alpha = hi && i == 1;
2303 args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2304 args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2305 }
2306 }
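/* Example: with bits == 10 and hi == true, the RGB sources are clamped to
 * [-512, 511] and the alpha source to [-2, 1], the representable range of
 * the 2-bit alpha channel of 2_10_10_10 formats. */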
2307
2308 LLVMValueRef res =
2309 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2310 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2311 }
2312
2313 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2314 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2315 bool hi)
2316 {
2317 assert(bits == 8 || bits == 10 || bits == 16);
2318
2319 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2320 LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2321
2322 /* Clamp. */
2323 if (bits != 16) {
2324 for (int i = 0; i < 2; i++) {
2325 bool alpha = hi && i == 1;
2326 args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2327 }
2328 }
2329
2330 LLVMValueRef res =
2331 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2332 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2333 }
2334
2335 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2336 {
2337 return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE);
2338 }
2339
2340 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2341 {
2342 ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0);
2343 }
2344
2345 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
2346 LLVMValueRef width, bool is_signed)
2347 {
2348 LLVMValueRef args[] = {
2349 input,
2350 offset,
2351 width,
2352 };
2353
2354 return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32",
2355 ctx->i32, args, 3, AC_FUNC_ATTR_READNONE);
2356 }
2357
2358 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
2359 LLVMValueRef s2)
2360 {
2361 return LLVMBuildAdd(ctx->builder, LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2362 }
2363
2364 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
2365 LLVMValueRef s2)
2366 {
2367 /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
2368 if (ctx->chip_class >= GFX10) {
2369 return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3,
2370 AC_FUNC_ATTR_READNONE);
2371 }
2372
2373 return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2374 }
2375
2376 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2377 {
2378 if (!wait_flags)
2379 return;
2380
2381 unsigned lgkmcnt = 63;
2382 unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
2383 unsigned vscnt = 63;
2384
2385 if (wait_flags & AC_WAIT_LGKM)
2386 lgkmcnt = 0;
2387 if (wait_flags & AC_WAIT_VLOAD)
2388 vmcnt = 0;
2389
2390 if (wait_flags & AC_WAIT_VSTORE) {
2391 if (ctx->chip_class >= GFX10)
2392 vscnt = 0;
2393 else
2394 vmcnt = 0;
2395 }
2396
2397 /* There is no intrinsic for vscnt(0), so use a fence. */
2398 if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
2399 vscnt == 0) {
2400 LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2401 return;
2402 }
2403
2404 unsigned simm16 = (lgkmcnt << 8) | (7 << 4) | /* expcnt */
2405 (vmcnt & 0xf) | ((vmcnt >> 4) << 14);
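/* Example: AC_WAIT_LGKM alone on GFX9+ gives lgkmcnt = 0 and vmcnt = 63
 * (no VM wait), so simm16 = (0 << 8) | (7 << 4) | 0xf | (3 << 14) = 0xc07f,
 * i.e. s_waitcnt lgkmcnt(0). */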
2406
2407 LLVMValueRef args[1] = {
2408 LLVMConstInt(ctx->i32, simm16, false),
2409 };
2410 ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
2411 }
2412
2413 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
2414 {
2415 LLVMTypeRef type;
2416 char *intr;
2417
2418 if (bitsize == 16) {
2419 intr = "llvm.amdgcn.fract.f16";
2420 type = ctx->f16;
2421 } else if (bitsize == 32) {
2422 intr = "llvm.amdgcn.fract.f32";
2423 type = ctx->f32;
2424 } else {
2425 intr = "llvm.amdgcn.fract.f64";
2426 type = ctx->f64;
2427 }
2428
2429 LLVMValueRef params[] = {
2430 src0,
2431 };
2432 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
2433 }
2434
2435 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2436 {
2437
2438 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2439 LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2440 unsigned vec_size = LLVMGetVectorSize(type);
2441 LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2442
2443 for (unsigned i = 0; i < vec_size; i++)
2444 scalars[i] = scalar;
2445 return LLVMConstVector(scalars, vec_size);
2446 }
2447 return LLVMConstInt(type, value, 0);
2448 }
2449
2450 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
2451 {
2452 LLVMTypeRef type = LLVMTypeOf(src0);
2453 LLVMValueRef val;
2454
2455 /* v_med3 is selected only when max is first. (LLVM bug?) */
2456 val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
2457 return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
2458 }
2459
2460 static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
2461 {
2462 ac_enable_signed_zeros(ctx);
2463 /* (val + 0) converts negative zero to positive zero. */
2464 val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
2465 ac_disable_signed_zeros(ctx);
2466 return val;
2467 }
2468
2469 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
2470 {
2471 LLVMTypeRef type = LLVMTypeOf(src);
2472 LLVMValueRef pos, neg, dw[2], val;
2473 unsigned bitsize = ac_get_elem_bits(ctx, type);
2474
2475 /* The standard version leads to this:
2476 * v_cmp_ngt_f32_e64 s[0:1], s4, 0 ; D40B0000 00010004
2477 * v_cndmask_b32_e64 v4, 1.0, s4, s[0:1] ; D5010004 000008F2
2478 * v_cmp_le_f32_e32 vcc, 0, v4 ; 7C060880
2479 * v_cndmask_b32_e32 v4, -1.0, v4, vcc ; 020808F3
2480 *
2481 * The isign version:
2482 * v_add_f32_e64 v4, s4, 0 ; D5030004 00010004
2483 * v_med3_i32 v4, v4, -1, 1 ; D5580004 02058304
2484 * v_cvt_f32_i32_e32 v4, v4 ; 7E080B04
2485 *
2486 * (src0 + 0) converts negative zero to positive zero.
2487 * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
2488 *
2489 * For FP64, use the standard version, which doesn't suffer from the huge DP rate
2490 * reduction. (FP64 comparisons are as fast as int64 comparisons)
2491 */
2492 if (bitsize == 16 || bitsize == 32) {
2493 val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
2494 val = ac_build_isign(ctx, val);
2495 return LLVMBuildSIToFP(ctx->builder, val, type, "");
2496 }
2497
2498 assert(bitsize == 64);
2499 pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
2500 neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
2501 dw[0] = ctx->i32_0;
2502 dw[1] = LLVMBuildSelect(
2503 ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
2504 LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
2505 "");
2506 return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
2507 }
2508
2509 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
2510 {
2511 LLVMValueRef result;
2512 unsigned bitsize;
2513
2514 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2515
2516 switch (bitsize) {
2517 case 128:
2518 result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1,
2519 AC_FUNC_ATTR_READNONE);
2520 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2521 break;
2522 case 64:
2523 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
2524 AC_FUNC_ATTR_READNONE);
2525
2526 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2527 break;
2528 case 32:
2529 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
2530 AC_FUNC_ATTR_READNONE);
2531 break;
2532 case 16:
2533 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
2534 AC_FUNC_ATTR_READNONE);
2535
2536 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2537 break;
2538 case 8:
2539 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
2540 AC_FUNC_ATTR_READNONE);
2541
2542 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2543 break;
2544 default:
2545 unreachable("invalid bitsize");
2546 break;
2547 }
2548
2549 return result;
2550 }
2551
2552 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
2553 {
2554 LLVMValueRef result;
2555 unsigned bitsize;
2556
2557 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2558
2559 switch (bitsize) {
2560 case 64:
2561 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
2562 AC_FUNC_ATTR_READNONE);
2563
2564 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2565 break;
2566 case 32:
2567 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
2568 AC_FUNC_ATTR_READNONE);
2569 break;
2570 case 16:
2571 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
2572 AC_FUNC_ATTR_READNONE);
2573
2574 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2575 break;
2576 case 8:
2577 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
2578 AC_FUNC_ATTR_READNONE);
2579
2580 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2581 break;
2582 default:
2583 unreachable("invalid bitsize");
2584 break;
2585 }
2586
2587 return result;
2588 }
2589
2590 #define AC_EXP_TARGET 0
2591 #define AC_EXP_ENABLED_CHANNELS 1
2592 #define AC_EXP_OUT0 2
2593
2594 enum ac_ir_type
2595 {
2596 AC_IR_UNDEF,
2597 AC_IR_CONST,
2598 AC_IR_VALUE,
2599 };
2600
2601 struct ac_vs_exp_chan {
2602 LLVMValueRef value;
2603 float const_float;
2604 enum ac_ir_type type;
2605 };
2606
2607 struct ac_vs_exp_inst {
2608 unsigned offset;
2609 LLVMValueRef inst;
2610 struct ac_vs_exp_chan chan[4];
2611 };
2612
2613 struct ac_vs_exports {
2614 unsigned num;
2615 struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
2616 };
2617
2618 /* Return true if the PARAM export has been eliminated. */
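/* Worked example: an export writing (0, 0, 0, 1) yields is_zero = {1,1,1,0}
 * and is_one = {0,0,0,1}, so DEFAULT_VAL 1 is chosen, the export instruction
 * is erased, and the SPI supplies (0, 0, 0, 1) for free. */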
2619 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, uint32_t num_outputs,
2620 struct ac_vs_exp_inst *exp)
2621 {
2622 unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
2623 bool is_zero[4] = {}, is_one[4] = {};
2624
2625 for (i = 0; i < 4; i++) {
2626 /* It's a constant expression. Undef outputs are eliminated too. */
2627 if (exp->chan[i].type == AC_IR_UNDEF) {
2628 is_zero[i] = true;
2629 is_one[i] = true;
2630 } else if (exp->chan[i].type == AC_IR_CONST) {
2631 if (exp->chan[i].const_float == 0)
2632 is_zero[i] = true;
2633 else if (exp->chan[i].const_float == 1)
2634 is_one[i] = true;
2635 else
2636 return false; /* other constant */
2637 } else
2638 return false;
2639 }
2640
2641 /* Only certain combinations of 0 and 1 can be eliminated (DEFAULT_VAL: 0 = 0,0,0,0; 1 = 0,0,0,1; 2 = 1,1,1,0; 3 = 1,1,1,1). */
2642 if (is_zero[0] && is_zero[1] && is_zero[2])
2643 default_val = is_zero[3] ? 0 : 1;
2644 else if (is_one[0] && is_one[1] && is_one[2])
2645 default_val = is_zero[3] ? 2 : 3;
2646 else
2647 return false;
2648
2649 /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
2650 LLVMInstructionEraseFromParent(exp->inst);
2651
2652 /* Change OFFSET to DEFAULT_VAL. */
2653 for (i = 0; i < num_outputs; i++) {
2654 if (vs_output_param_offset[i] == exp->offset) {
2655 vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
2656 break;
2657 }
2658 }
2659 return true;
2660 }
2661
2662 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
2663 uint8_t *vs_output_param_offset, uint32_t num_outputs,
2664 struct ac_vs_exports *processed,
2665 struct ac_vs_exp_inst *exp)
2666 {
2667 unsigned p, copy_back_channels = 0;
2668
2669 /* See if the output is already in the list of processed outputs.
2670 * The LLVMValueRef comparison relies on SSA.
2671 */
2672 for (p = 0; p < processed->num; p++) {
2673 bool different = false;
2674
2675 for (unsigned j = 0; j < 4; j++) {
2676 struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
2677 struct ac_vs_exp_chan *c2 = &exp->chan[j];
2678
2679 /* Treat undef as a match. */
2680 if (c2->type == AC_IR_UNDEF)
2681 continue;
2682
2683 /* If c1 is undef but c2 isn't, we can copy c2 to c1
2684 * and consider the instruction duplicated.
2685 */
2686 if (c1->type == AC_IR_UNDEF) {
2687 copy_back_channels |= 1 << j;
2688 continue;
2689 }
2690
2691 /* Test whether the channels are not equal. */
2692 if (c1->type != c2->type ||
2693 (c1->type == AC_IR_CONST && c1->const_float != c2->const_float) ||
2694 (c1->type == AC_IR_VALUE && c1->value != c2->value)) {
2695 different = true;
2696 break;
2697 }
2698 }
2699 if (!different)
2700 break;
2701
2702 copy_back_channels = 0;
2703 }
2704 if (p == processed->num)
2705 return false;
2706
2707 /* If a match was found, but the matching export has undef where the new
2708 * one has a normal value, copy the normal value to the undef channel.
2709 */
2710 struct ac_vs_exp_inst *match = &processed->exp[p];
2711
2712 /* Get current enabled channels mask. */
2713 LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
2714 unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
2715
2716 while (copy_back_channels) {
2717 unsigned chan = u_bit_scan(&copy_back_channels);
2718
2719 assert(match->chan[chan].type == AC_IR_UNDEF);
2720 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, exp->chan[chan].value);
2721 match->chan[chan] = exp->chan[chan];
2722
2723 /* Update number of enabled channels because the original mask
2724 * is not always 0xf.
2725 */
2726 enabled_channels |= (1 << chan);
2727 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
2728 LLVMConstInt(ctx->i32, enabled_channels, 0));
2729 }
2730
2731 /* The PARAM export is duplicated. Kill it. */
2732 LLVMInstructionEraseFromParent(exp->inst);
2733
2734 /* Change OFFSET to the matching export. */
2735 for (unsigned i = 0; i < num_outputs; i++) {
2736 if (vs_output_param_offset[i] == exp->offset) {
2737 vs_output_param_offset[i] = match->offset;
2738 break;
2739 }
2740 }
2741 return true;
2742 }
2743
2744 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn,
2745 uint8_t *vs_output_param_offset, uint32_t num_outputs,
2746 uint32_t skip_output_mask, uint8_t *num_param_exports)
2747 {
2748 LLVMBasicBlockRef bb;
2749 bool removed_any = false;
2750 struct ac_vs_exports exports;
2751
2752 exports.num = 0;
2753
2754 /* Process all LLVM instructions. */
2755 bb = LLVMGetFirstBasicBlock(main_fn);
2756 while (bb) {
2757 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
2758
2759 while (inst) {
2760 LLVMValueRef cur = inst;
2761 inst = LLVMGetNextInstruction(inst);
2762 struct ac_vs_exp_inst exp;
2763
2764 if (LLVMGetInstructionOpcode(cur) != LLVMCall)
2765 continue;
2766
2767 LLVMValueRef callee = ac_llvm_get_called_value(cur);
2768
2769 if (!ac_llvm_is_function(callee))
2770 continue;
2771
2772 const char *name = LLVMGetValueName(callee);
2773 unsigned num_args = LLVMCountParams(callee);
2774
2775 /* Check if this is an export instruction. */
2776 if ((num_args != 9 && num_args != 8) ||
2777 (strcmp(name, "llvm.SI.export") && strcmp(name, "llvm.amdgcn.exp.f32")))
2778 continue;
2779
2780 LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
2781 unsigned target = LLVMConstIntGetZExtValue(arg);
2782
2783 if (target < V_008DFC_SQ_EXP_PARAM)
2784 continue;
2785
2786 target -= V_008DFC_SQ_EXP_PARAM;
2787
2788 /* Parse the instruction. */
2789 memset(&exp, 0, sizeof(exp));
2790 exp.offset = target;
2791 exp.inst = cur;
2792
2793 for (unsigned i = 0; i < 4; i++) {
2794 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
2795
2796 exp.chan[i].value = v;
2797
2798 if (LLVMIsUndef(v)) {
2799 exp.chan[i].type = AC_IR_UNDEF;
2800 } else if (LLVMIsAConstantFP(v)) {
2801 LLVMBool loses_info;
2802 exp.chan[i].type = AC_IR_CONST;
2803 exp.chan[i].const_float = LLVMConstRealGetDouble(v, &loses_info);
2804 } else {
2805 exp.chan[i].type = AC_IR_VALUE;
2806 }
2807 }
2808
2809 /* Eliminate constant and duplicated PARAM exports. */
2810 if (!((1u << target) & skip_output_mask) &&
2811 (ac_eliminate_const_output(vs_output_param_offset, num_outputs, &exp) ||
2812 ac_eliminate_duplicated_output(ctx, vs_output_param_offset, num_outputs, &exports,
2813 &exp))) {
2814 removed_any = true;
2815 } else {
2816 exports.exp[exports.num++] = exp;
2817 }
2818 }
2819 bb = LLVMGetNextBasicBlock(bb);
2820 }
2821
2822 /* Remove holes in export memory due to removed PARAM exports.
2823 * This is done by renumbering all PARAM exports.
2824 */
2825 if (removed_any) {
2826 uint8_t old_offset[VARYING_SLOT_MAX];
2827 unsigned out, i;
2828
2829 /* Make a copy of the offsets. We need the old version while
2830 * we are modifying some of them. */
2831 memcpy(old_offset, vs_output_param_offset, sizeof(old_offset));
2832
2833 for (i = 0; i < exports.num; i++) {
2834 unsigned offset = exports.exp[i].offset;
2835
2836 /* Update vs_output_param_offset. Multiple outputs can
2837 * have the same offset.
2838 */
2839 for (out = 0; out < num_outputs; out++) {
2840 if (old_offset[out] == offset)
2841 vs_output_param_offset[out] = i;
2842 }
2843
2844 /* Change the PARAM offset in the instruction. */
2845 LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
2846 LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_PARAM + i, 0));
2847 }
2848 *num_param_exports = exports.num;
2849 }
2850 }
2851
2852 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2853 {
2854 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2855 ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
2856 AC_FUNC_ATTR_CONVERGENT);
2857 }
2858
2859 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2860 {
2861 unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
2862 ctx->lds = LLVMBuildIntToPtr(
2863 ctx->builder, ctx->i32_0,
2864 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
2865 }
2866
2867 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
2868 {
2869 return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
2870 }
2871
2872 void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
2873 {
2874 value = ac_to_integer(ctx, value);
2875 ac_build_indexed_store(ctx, ctx->lds, dw_addr, value);
2876 }
2877
2878 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
2879 {
2880 unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2881 const char *intrin_name;
2882 LLVMTypeRef type;
2883 LLVMValueRef zero;
2884
2885 switch (src0_bitsize) {
2886 case 64:
2887 intrin_name = "llvm.cttz.i64";
2888 type = ctx->i64;
2889 zero = ctx->i64_0;
2890 break;
2891 case 32:
2892 intrin_name = "llvm.cttz.i32";
2893 type = ctx->i32;
2894 zero = ctx->i32_0;
2895 break;
2896 case 16:
2897 intrin_name = "llvm.cttz.i16";
2898 type = ctx->i16;
2899 zero = ctx->i16_0;
2900 break;
2901 case 8:
2902 intrin_name = "llvm.cttz.i8";
2903 type = ctx->i8;
2904 zero = ctx->i8_0;
2905 break;
2906 default:
2907 unreachable("invalid bitsize");
2908 }
2909
2910 LLVMValueRef params[2] = {
2911 src0,
2912
2913 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
2914 * add special code to check for x=0. The reason is that
2915 * the LLVM behavior for x=0 is different from what we
2916 * need here. However, LLVM also assumes that ffs(x) is
2917 * in [0, 31], but GLSL expects that ffs(0) = -1, so
2918 * a conditional assignment to handle 0 is still required.
2919 *
2920 * The hardware already implements the correct behavior.
2921 */
2922 ctx->i1true,
2923 };
2924
2925 LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);
2926
2927 if (src0_bitsize == 64) {
2928 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
2929 } else if (src0_bitsize < 32) {
2930 lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
2931 }
2932
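/* Worked example: src0 = 0b10100 returns 2, the index of the lowest set
 * bit; src0 = 0 takes the select below and returns -1, matching GLSL
 * findLSB(). */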
2933 /* TODO: We need an intrinsic to skip this conditional. */
2934 /* Check for zero: */
2935 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
2936 LLVMConstInt(ctx->i32, -1, 0), lsb, "");
2937 }
2938
2939 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
2940 {
2941 return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
2942 }
2943
2944 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
2945 {
2946 return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
2947 }
2948
2949 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2950 {
2951 if (ctx->flow->depth > 0)
2952 return &ctx->flow->stack[ctx->flow->depth - 1];
2953 return NULL;
2954 }
2955
2956 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2957 {
2958 for (unsigned i = ctx->flow->depth; i > 0; --i) {
2959 if (ctx->flow->stack[i - 1].loop_entry_block)
2960 return &ctx->flow->stack[i - 1];
2961 }
2962 return NULL;
2963 }
2964
2965 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
2966 {
2967 struct ac_llvm_flow *flow;
2968
2969 if (ctx->flow->depth >= ctx->flow->depth_max) {
2970 unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
2971
2972 ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
2973 ctx->flow->depth_max = new_max;
2974 }
2975
2976 flow = &ctx->flow->stack[ctx->flow->depth];
2977 ctx->flow->depth++;
2978
2979 flow->next_block = NULL;
2980 flow->loop_entry_block = NULL;
2981 return flow;
2982 }
2983
2984 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
2985 {
2986 char buf[32];
2987 snprintf(buf, sizeof(buf), "%s%d", base, label_id);
2988 LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
2989 }
2990
2991 /* Append a basic block at the level of the parent flow.
2992 */
2993 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
2994 {
2995 assert(ctx->flow->depth >= 1);
2996
2997 if (ctx->flow->depth >= 2) {
2998 struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
2999
3000 return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
3001 }
3002
3003 LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3004 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3005 }
3006
3007 /* Emit a branch to the given default target for the current block if
3008 * applicable -- that is, if the current block does not already contain a
3009 * branch from a break or continue.
3010 */
3011 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
3012 {
3013 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3014 LLVMBuildBr(builder, target);
3015 }
3016
3017 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3018 {
3019 struct ac_llvm_flow *flow = push_flow(ctx);
3020 flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3021 flow->next_block = append_basic_block(ctx, "ENDLOOP");
3022 set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3023 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3024 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3025 }
3026
3027 void ac_build_break(struct ac_llvm_context *ctx)
3028 {
3029 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3030 LLVMBuildBr(ctx->builder, flow->next_block);
3031 }
3032
3033 void ac_build_continue(struct ac_llvm_context *ctx)
3034 {
3035 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3036 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3037 }
3038
3039 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3040 {
3041 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3042 LLVMBasicBlockRef endif_block;
3043
3044 assert(!current_branch->loop_entry_block);
3045
3046 endif_block = append_basic_block(ctx, "ENDIF");
3047 emit_default_branch(ctx->builder, endif_block);
3048
3049 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3050 set_basicblock_name(current_branch->next_block, "else", label_id);
3051
3052 current_branch->next_block = endif_block;
3053 }
3054
3055 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3056 {
3057 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3058
3059 assert(!current_branch->loop_entry_block);
3060
3061 emit_default_branch(ctx->builder, current_branch->next_block);
3062 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3063 set_basicblock_name(current_branch->next_block, "endif", label_id);
3064
3065 ctx->flow->depth--;
3066 }
3067
3068 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3069 {
3070 struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3071
3072 assert(current_loop->loop_entry_block);
3073
3074 emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3075
3076 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3077 set_basicblock_name(current_loop->next_block, "endloop", label_id);
3078 ctx->flow->depth--;
3079 }
3080
3081 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3082 {
3083 struct ac_llvm_flow *flow = push_flow(ctx);
3084 LLVMBasicBlockRef if_block;
3085
3086 if_block = append_basic_block(ctx, "IF");
3087 flow->next_block = append_basic_block(ctx, "ELSE");
3088 set_basicblock_name(if_block, "if", label_id);
3089 LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3090 LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3091 }
3092
3093 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, int label_id)
3094 {
3095 LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE, value, ctx->f32_0, "");
3096 ac_build_ifcc(ctx, cond, label_id);
3097 }
3098
3099 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, int label_id)
3100 {
3101 LLVMValueRef cond =
3102 LLVMBuildICmp(ctx->builder, LLVMIntNE, ac_to_integer(ctx, value), ctx->i32_0, "");
3103 ac_build_ifcc(ctx, cond, label_id);
3104 }
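
/* A minimal usage sketch of the structured-control-flow helpers above
 * (illustrative only; nothing in this file calls it). The helpers must be
 * used in properly nested pairs, and label_id merely names the emitted
 * basic blocks ("if9999", "else9999", "endif9999"). */
UNUSED static void ac_example_if_else(struct ac_llvm_context *ctx, LLVMValueRef cond,
                                      LLVMValueRef ptr, LLVMValueRef a, LLVMValueRef b)
{
   ac_build_ifcc(ctx, cond, 9999); /* branch on an i1 condition */
   LLVMBuildStore(ctx->builder, a, ptr); /* "then" side */
   ac_build_else(ctx, 9999);
   LLVMBuildStore(ctx->builder, b, ptr); /* "else" side */
   ac_build_endif(ctx, 9999); /* rejoin and pop the flow stack */
}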
3105
3106 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3107 {
3108 LLVMBuilderRef builder = ac->builder;
3109 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3110 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3111 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3112 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3113 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3114 LLVMValueRef res;
3115
3116 if (first_instr) {
3117 LLVMPositionBuilderBefore(first_builder, first_instr);
3118 } else {
3119 LLVMPositionBuilderAtEnd(first_builder, first_block);
3120 }
3121
3122 res = LLVMBuildAlloca(first_builder, type, name);
3123 LLVMDisposeBuilder(first_builder);
3124 return res;
3125 }
3126
3127 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3128 {
3129 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3130 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3131 return ptr;
3132 }
3133
3134 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
3135 {
3136 int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3137 return LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), "");
3138 }
3139
3140 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
3141 {
3142 unsigned num_components = ac_get_llvm_num_components(value);
3143 if (count == num_components)
3144 return value;
3145
3146 LLVMValueRef masks[MAX2(count, 2)];
3147 masks[0] = ctx->i32_0;
3148 masks[1] = ctx->i32_1;
3149 for (unsigned i = 2; i < count; i++)
3150 masks[i] = LLVMConstInt(ctx->i32, i, false);
3151
3152 if (count == 1)
3153 return LLVMBuildExtractElement(ctx->builder, value, masks[0], "");
3154
3155 LLVMValueRef swizzle = LLVMConstVector(masks, count);
3156 return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3157 }
3158
3159 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
3160 unsigned bitwidth)
3161 {
3162 LLVMValueRef value = param;
3163 if (rshift)
3164 value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(ctx->i32, rshift, false), "");
3165
3166 if (rshift + bitwidth < 32) {
3167 unsigned mask = (1 << bitwidth) - 1;
3168 value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(ctx->i32, mask, false), "");
3169 }
3170 return value;
3171 }
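
/* A scalar sketch of what ac_unpack_param emits (illustrative helper, not
 * driver code): extract 'bitwidth' bits of 'param' starting at bit 'rshift'.
 */
UNUSED static unsigned ac_unpack_param_scalar(unsigned param, unsigned rshift,
                                              unsigned bitwidth)
{
   unsigned value = param >> rshift;
   if (rshift + bitwidth < 32)
      value &= (1u << bitwidth) - 1;
   return value;
}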
3172
3173 /* Adjust the sample index according to FMASK.
3174 *
3175 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3176 * which is the identity mapping. Each nibble says which physical sample
3177 * should be fetched to get that sample.
3178 *
3179 * For example, 0x11111100 means there are only 2 samples stored and
3180 * the second sample covers 3/4 of the pixel. When reading samples 0
3181 * and 1, return physical sample 0 (determined by the first two 0s
3182 * in FMASK), otherwise return physical sample 1.
3183 *
3184 * The sample index should be adjusted as follows:
3185 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3186 */
3187 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr,
3188 bool is_array_tex)
3189 {
3190 struct ac_image_args fmask_load = {};
3191 fmask_load.opcode = ac_image_load;
3192 fmask_load.resource = fmask;
3193 fmask_load.dmask = 0xf;
3194 fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3195 fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3196
3197 fmask_load.coords[0] = addr[0];
3198 fmask_load.coords[1] = addr[1];
3199 if (is_array_tex)
3200 fmask_load.coords[2] = addr[2];
3201
3202 LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3203 fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, "");
3204
3205 /* Apply the formula. */
3206 unsigned sample_chan = is_array_tex ? 3 : 2;
3207 LLVMValueRef final_sample;
3208 final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], LLVMConstInt(ac->i32, 4, 0), "");
3209 final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3210 /* Mask the sample index by 0x7, because 0x8 means an unknown value
3211 * with EQAA, so those will map to 0. */
3212 final_sample = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");
3213
3214 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3215 * resource descriptor is 0 (invalid).
3216 */
3217 LLVMValueRef tmp;
3218 tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3219 tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3220 tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3221
3222 /* Replace the MSAA sample index. */
3223 addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, addr[sample_chan], "");
3224 }
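
/* A scalar sketch of the remapping above (illustrative helper, not driver
 * code): apply the FMASK formula and the 0x7 EQAA mask to one sample index.
 */
UNUSED static unsigned ac_fmask_remap_scalar(uint32_t fmask, unsigned sample)
{
   unsigned physical = (fmask >> (sample * 4)) & 0xF;
   return physical & 0x7; /* 0x8 (unknown with EQAA) maps to 0 */
}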
3225
3226 static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
3227 LLVMValueRef lane, bool with_opt_barrier)
3228 {
3229 LLVMTypeRef type = LLVMTypeOf(src);
3230 LLVMValueRef result;
3231
3232 if (with_opt_barrier)
3233 ac_build_optimization_barrier(ctx, &src);
3234
3235 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3236 if (lane)
3237 lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");
3238
3239 result =
3240 ac_build_intrinsic(ctx, lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3241 ctx->i32, (LLVMValueRef[]){src, lane}, lane == NULL ? 1 : 2,
3242 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3243
3244 return LLVMBuildTrunc(ctx->builder, result, type, "");
3245 }
3246
3247 static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
3248 LLVMValueRef lane, bool with_opt_barrier)
3249 {
3250 LLVMTypeRef src_type = LLVMTypeOf(src);
3251 src = ac_to_integer(ctx, src);
3252 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3253 LLVMValueRef ret;
3254
3255 if (bits > 32) {
3256 assert(bits % 32 == 0);
3257 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3258 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3259 ret = LLVMGetUndef(vec_type);
3260 for (unsigned i = 0; i < bits / 32; i++) {
3261 LLVMValueRef ret_comp;
3262
3263 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3264
3265 ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
3266
3267 ret =
3268 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3269 }
3270 } else {
3271 ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
3272 }
3273
3274 if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
3275 return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
3276 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3277 }
3278
3279 /**
3280 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3281 *
3282 * The optimization barrier is not needed if the value is the same in all lanes
3283 * or if this is called in the outermost block.
3284 *
3285 * @param ctx - the LLVM context
3286 * @param src - the value to read from the lane
3287 * @param lane - id of the lane or NULL for the first active lane
3288 * @return value of the lane
3289 */
3290 LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
3291 LLVMValueRef lane)
3292 {
3293 return ac_build_readlane_common(ctx, src, lane, false);
3294 }
3295
3296 LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3297 {
3298 return ac_build_readlane_common(ctx, src, lane, true);
3299 }
3300
3301 LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
3302 LLVMValueRef lane)
3303 {
3304 return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
3305 (LLVMValueRef[]){value, lane, src}, 3,
3306 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3307 }
3308
3309 LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3310 {
3311 if (ctx->wave_size == 32) {
3312 return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3313 (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
3314 }
3315 LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
3316 LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");
3317 LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");
3318 LLVMValueRef val =
3319 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3320 (LLVMValueRef[]){mask_lo, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
3321 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val},
3322 2, AC_FUNC_ATTR_READNONE);
3323 return val;
3324 }
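
/* A scalar model of mbcnt (illustrative, not driver code): for lane 'tid',
 * count the set bits of 'mask' strictly below that lane.
 */
UNUSED static unsigned ac_mbcnt_scalar(uint64_t mask, unsigned tid)
{
   assert(tid < 64);
   return util_bitcount64(mask & (((uint64_t)1 << tid) - 1));
}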
3325
3326 enum dpp_ctrl
3327 {
3328 _dpp_quad_perm = 0x000,
3329 _dpp_row_sl = 0x100,
3330 _dpp_row_sr = 0x110,
3331 _dpp_row_rr = 0x120,
3332 dpp_wf_sl1 = 0x130,
3333 dpp_wf_rl1 = 0x134,
3334 dpp_wf_sr1 = 0x138,
3335 dpp_wf_rr1 = 0x13C,
3336 dpp_row_mirror = 0x140,
3337 dpp_row_half_mirror = 0x141,
3338 dpp_row_bcast15 = 0x142,
3339 dpp_row_bcast31 = 0x143
3340 };
3341
3342 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
3343 unsigned lane3)
3344 {
3345 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3346 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3347 }
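
/* A scalar model of the encoding above (illustrative, based on the DPP
 * quad_perm semantics): within its quad, lane i reads from the lane selected
 * by the i-th 2-bit field of the control word.
 */
UNUSED static unsigned dpp_quad_perm_src_lane(unsigned ctrl, unsigned lane)
{
   unsigned quad_base = lane & ~0x3u;
   unsigned sel = (ctrl >> (2 * (lane & 0x3))) & 0x3;
   return quad_base + sel;
}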
3348
3349 static inline enum dpp_ctrl dpp_row_sl(unsigned amount)
3350 {
3351 assert(amount > 0 && amount < 16);
3352 return _dpp_row_sl | amount;
3353 }
3354
3355 static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
3356 {
3357 assert(amount > 0 && amount < 16);
3358 return _dpp_row_sr | amount;
3359 }
3360
3361 static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3362 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3363 bool bound_ctrl)
3364 {
3365 LLVMTypeRef type = LLVMTypeOf(src);
3366 LLVMValueRef res;
3367
3368 old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
3369 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3370
3371 res = ac_build_intrinsic(
3372 ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32,
3373 (LLVMValueRef[]){old, src, LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3374 LLVMConstInt(ctx->i32, row_mask, 0), LLVMConstInt(ctx->i32, bank_mask, 0),
3375 LLVMConstInt(ctx->i1, bound_ctrl, 0)},
3376 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3377
3378 return LLVMBuildTrunc(ctx->builder, res, type, "");
3379 }
3380
3381 static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3382 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3383 bool bound_ctrl)
3384 {
3385 LLVMTypeRef src_type = LLVMTypeOf(src);
3386 src = ac_to_integer(ctx, src);
3387 old = ac_to_integer(ctx, old);
3388 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3389 LLVMValueRef ret;
3390 if (bits > 32) {
3391 assert(bits % 32 == 0);
3392 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3393 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3394 LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3395 ret = LLVMGetUndef(vec_type);
3396 for (unsigned i = 0; i < bits / 32; i++) {
3397 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3398 old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), "");
3399 LLVMValueRef ret_comp =
3400 _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
3401 ret =
3402 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3403 }
3404 } else {
3405 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
3406 }
3407 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3408 }
3409
3410 static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
3411 uint64_t sel, bool exchange_rows, bool bound_ctrl)
3412 {
3413 LLVMTypeRef type = LLVMTypeOf(src);
3414 LLVMValueRef result;
3415
3416 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3417
3418 LLVMValueRef args[6] = {
3419 src,
3420 src,
3421 LLVMConstInt(ctx->i32, sel, false),
3422 LLVMConstInt(ctx->i32, sel >> 32, false),
3423 ctx->i1true, /* fi (fetch inactive) */
3424 bound_ctrl ? ctx->i1true : ctx->i1false,
3425 };
3426
3427 result =
3428 ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",
3429 ctx->i32, args, 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3430
3431 return LLVMBuildTrunc(ctx->builder, result, type, "");
3432 }
3433
3434 static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3435 bool exchange_rows, bool bound_ctrl)
3436 {
3437 LLVMTypeRef src_type = LLVMTypeOf(src);
3438 src = ac_to_integer(ctx, src);
3439 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3440 LLVMValueRef ret;
3441 if (bits > 32) {
3442 assert(bits % 32 == 0);
3443 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3444 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3445 ret = LLVMGetUndef(vec_type);
3446 for (unsigned i = 0; i < bits / 32; i++) {
3447 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3448 LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
3449 ret =
3450 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3451 }
3452 } else {
3453 ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
3454 }
3455 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3456 }
3457
3458 static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
3459 {
3460 assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
3461 return and_mask | (or_mask << 5) | (xor_mask << 10);
3462 }
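
/* A scalar model of the bit-mode pattern (illustrative, based on the
 * ds_swizzle semantics): within each group of 32 lanes, lane i reads from
 * ((i & and_mask) | or_mask) ^ xor_mask.
 */
UNUSED static unsigned ds_bitmode_src_lane(unsigned lane, unsigned and_mask,
                                           unsigned or_mask, unsigned xor_mask)
{
   unsigned group_base = lane & ~0x1fu;
   return group_base + ((((lane & 0x1f) & and_mask) | or_mask) ^ xor_mask);
}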
3463
3464 static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
3465 unsigned mask)
3466 {
3467 LLVMTypeRef src_type = LLVMTypeOf(src);
3468 LLVMValueRef ret;
3469
3470 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3471
3472 ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32,
3473 (LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2,
3474 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3475
3476 return LLVMBuildTrunc(ctx->builder, ret, src_type, "");
3477 }
3478
3479 LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3480 {
3481 LLVMTypeRef src_type = LLVMTypeOf(src);
3482 src = ac_to_integer(ctx, src);
3483 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3484 LLVMValueRef ret;
3485 if (bits > 32) {
3486 assert(bits % 32 == 0);
3487 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3488 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3489 ret = LLVMGetUndef(vec_type);
3490 for (unsigned i = 0; i < bits / 32; i++) {
3491 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3492 LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask);
3493 ret =
3494 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3495 }
3496 } else {
3497 ret = _ac_build_ds_swizzle(ctx, src, mask);
3498 }
3499 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3500 }
3501
3502 static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
3503 {
3504 LLVMTypeRef src_type = LLVMTypeOf(src);
3505 unsigned bitsize = ac_get_elem_bits(ctx, src_type);
3506 char name[32], type[8];
3507 LLVMValueRef ret;
3508
3509 src = ac_to_integer(ctx, src);
3510
3511 if (bitsize < 32)
3512 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3513
3514 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3515 snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
3516 ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1,
3517 AC_FUNC_ATTR_READNONE);
3518
3519 if (bitsize < 32)
3520 ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");
3521
3522 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3523 }
3524
3525 static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
3526 LLVMValueRef inactive)
3527 {
3528 char name[33], type[8];
3529 LLVMTypeRef src_type = LLVMTypeOf(src);
3530 unsigned bitsize = ac_get_elem_bits(ctx, src_type);
3531 src = ac_to_integer(ctx, src);
3532 inactive = ac_to_integer(ctx, inactive);
3533
3534 if (bitsize < 32) {
3535 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3536 inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
3537 }
3538
3539 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3540 snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
3541 LLVMValueRef ret =
3542 ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,
3543 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3544 if (bitsize < 32)
3545 ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");
3546
3547 return ret;
3548 }
3549
3550 static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
3551 unsigned type_size)
3552 {
3553 if (type_size == 1) {
3554 switch (op) {
3555 case nir_op_iadd:
3556 return ctx->i8_0;
3557 case nir_op_imul:
3558 return ctx->i8_1;
3559 case nir_op_imin:
3560 return LLVMConstInt(ctx->i8, INT8_MAX, 0);
3561 case nir_op_umin:
3562 return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
3563 case nir_op_imax:
3564 return LLVMConstInt(ctx->i8, INT8_MIN, 0);
3565 case nir_op_umax:
3566 return ctx->i8_0;
3567 case nir_op_iand:
3568 return LLVMConstInt(ctx->i8, -1, 0);
3569 case nir_op_ior:
3570 return ctx->i8_0;
3571 case nir_op_ixor:
3572 return ctx->i8_0;
3573 default:
3574 unreachable("bad reduction intrinsic");
3575 }
3576 } else if (type_size == 2) {
3577 switch (op) {
3578 case nir_op_iadd:
3579 return ctx->i16_0;
3580 case nir_op_fadd:
3581 return ctx->f16_0;
3582 case nir_op_imul:
3583 return ctx->i16_1;
3584 case nir_op_fmul:
3585 return ctx->f16_1;
3586 case nir_op_imin:
3587 return LLVMConstInt(ctx->i16, INT16_MAX, 0);
3588 case nir_op_umin:
3589 return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
3590 case nir_op_fmin:
3591 return LLVMConstReal(ctx->f16, INFINITY);
3592 case nir_op_imax:
3593 return LLVMConstInt(ctx->i16, INT16_MIN, 0);
3594 case nir_op_umax:
3595 return ctx->i16_0;
3596 case nir_op_fmax:
3597 return LLVMConstReal(ctx->f16, -INFINITY);
3598 case nir_op_iand:
3599 return LLVMConstInt(ctx->i16, -1, 0);
3600 case nir_op_ior:
3601 return ctx->i16_0;
3602 case nir_op_ixor:
3603 return ctx->i16_0;
3604 default:
3605 unreachable("bad reduction intrinsic");
3606 }
3607 } else if (type_size == 4) {
3608 switch (op) {
3609 case nir_op_iadd:
3610 return ctx->i32_0;
3611 case nir_op_fadd:
3612 return ctx->f32_0;
3613 case nir_op_imul:
3614 return ctx->i32_1;
3615 case nir_op_fmul:
3616 return ctx->f32_1;
3617 case nir_op_imin:
3618 return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3619 case nir_op_umin:
3620 return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3621 case nir_op_fmin:
3622 return LLVMConstReal(ctx->f32, INFINITY);
3623 case nir_op_imax:
3624 return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3625 case nir_op_umax:
3626 return ctx->i32_0;
3627 case nir_op_fmax:
3628 return LLVMConstReal(ctx->f32, -INFINITY);
3629 case nir_op_iand:
3630 return LLVMConstInt(ctx->i32, -1, 0);
3631 case nir_op_ior:
3632 return ctx->i32_0;
3633 case nir_op_ixor:
3634 return ctx->i32_0;
3635 default:
3636 unreachable("bad reduction intrinsic");
3637 }
3638 } else { /* type_size == 8 (64-bit) */
3639 switch (op) {
3640 case nir_op_iadd:
3641 return ctx->i64_0;
3642 case nir_op_fadd:
3643 return ctx->f64_0;
3644 case nir_op_imul:
3645 return ctx->i64_1;
3646 case nir_op_fmul:
3647 return ctx->f64_1;
3648 case nir_op_imin:
3649 return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3650 case nir_op_umin:
3651 return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3652 case nir_op_fmin:
3653 return LLVMConstReal(ctx->f64, INFINITY);
3654 case nir_op_imax:
3655 return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3656 case nir_op_umax:
3657 return ctx->i64_0;
3658 case nir_op_fmax:
3659 return LLVMConstReal(ctx->f64, -INFINITY);
3660 case nir_op_iand:
3661 return LLVMConstInt(ctx->i64, -1, 0);
3662 case nir_op_ior:
3663 return ctx->i64_0;
3664 case nir_op_ixor:
3665 return ctx->i64_0;
3666 default:
3667 unreachable("bad reduction intrinsic");
3668 }
3669 }
3670 }
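
/* The identities above satisfy op(identity, x) == x for every x; a few
 * spot checks (illustrative, not driver code):
 */
UNUSED static void check_reduction_identities(int32_t x)
{
   assert(MIN2(INT32_MAX, x) == x); /* imin */
   assert(MAX2(INT32_MIN, x) == x); /* imax */
   assert((x + 0) == x);            /* iadd */
   assert((x | 0) == x);            /* ior */
   assert((x & -1) == x);           /* iand */
}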
3671
3672 static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
3673 nir_op op)
3674 {
3675 bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
3676 bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
3677 switch (op) {
3678 case nir_op_iadd:
3679 return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
3680 case nir_op_fadd:
3681 return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
3682 case nir_op_imul:
3683 return LLVMBuildMul(ctx->builder, lhs, rhs, "");
3684 case nir_op_fmul:
3685 return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
3686 case nir_op_imin:
3687 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
3688 lhs, rhs, "");
3689 case nir_op_umin:
3690 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
3691 lhs, rhs, "");
3692 case nir_op_fmin:
3693 return ac_build_intrinsic(
3694 ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
3695 _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
3696 AC_FUNC_ATTR_READNONE);
3697 case nir_op_imax:
3698 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
3699 lhs, rhs, "");
3700 case nir_op_umax:
3701 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
3702 lhs, rhs, "");
3703 case nir_op_fmax:
3704 return ac_build_intrinsic(
3705 ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
3706 _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
3707 AC_FUNC_ATTR_READNONE);
3708 case nir_op_iand:
3709 return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
3710 case nir_op_ior:
3711 return LLVMBuildOr(ctx->builder, lhs, rhs, "");
3712 case nir_op_ixor:
3713 return LLVMBuildXor(ctx->builder, lhs, rhs, "");
3714 default:
3715 unreachable("bad reduction intrinsic");
3716 }
3717 }
3718
3719 /**
3720 * \param src The value to shift.
3721 * \param identity The value to use for the first lane.
3722 * \param maxprefix specifies that the result only needs to be correct for a
3723 * prefix of this many threads
3724 * \return src, shifted 1 lane up, and identity shifted into lane 0.
3725 */
3726 static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
3727 LLVMValueRef identity, unsigned maxprefix)
3728 {
3729 if (ctx->chip_class >= GFX10) {
3730 /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
3731 LLVMValueRef active, tmp1, tmp2;
3732 LLVMValueRef tid = ac_get_thread_id(ctx);
3733
3734 tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3735
3736 tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
3737
3738 if (maxprefix > 32) {
3739 active =
3740 LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
3741
3742 tmp2 = LLVMBuildSelect(ctx->builder, active,
3743 ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
3744 tmp2, "");
3745
3746 active = LLVMBuildOr(
3747 ctx->builder, active,
3748 LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3749 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
3750 LLVMConstInt(ctx->i32, 0x10, false), ""),
3751 "");
3752 return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3753 } else if (maxprefix > 16) {
3754 active =
3755 LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
3756
3757 return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3758 }
3759 } else if (ctx->chip_class >= GFX8) {
3760 return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
3761 }
3762
3763 /* wavefront shift_right by 1 on SI/CI */
3764 LLVMValueRef active, tmp1, tmp2;
3765 LLVMValueRef tid = ac_get_thread_id(ctx);
3766 tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
3767 tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
3768 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3769 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
3770 LLVMConstInt(ctx->i32, 0x4, 0), "");
3771 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3772 tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
3773 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3774 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
3775 LLVMConstInt(ctx->i32, 0x8, 0), "");
3776 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3777 tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3778 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3779 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
3780 LLVMConstInt(ctx->i32, 0x10, 0), "");
3781 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3782 tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
3783 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
3784 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3785 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
3786 return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
3787 }
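
/* A scalar model of the result (illustrative, not driver code): src moves
 * one lane up and 'identity' enters lane 0.
 */
UNUSED static void wf_shift_right_1_scalar(uint32_t *lanes, unsigned num_lanes,
                                           uint32_t identity)
{
   for (unsigned i = num_lanes; i-- > 1;)
      lanes[i] = lanes[i - 1];
   lanes[0] = identity;
}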
3788
3789 /**
3790 * \param maxprefix specifies that the result only needs to be correct for a
3791 * prefix of this many threads
3792 */
3793 static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
3794 LLVMValueRef identity, unsigned maxprefix, bool inclusive)
3795 {
3796 LLVMValueRef result, tmp;
3797
3798 if (!inclusive)
3799 src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);
3800
3801 result = src;
3802
3803 if (ctx->chip_class <= GFX7) {
3804 assert(maxprefix == 64);
3805 LLVMValueRef tid = ac_get_thread_id(ctx);
3806 LLVMValueRef active;
3807 tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
3808 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3809 LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
3810 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3811 result = ac_build_alu_op(ctx, result, tmp, op);
3812 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
3813 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3814 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
3815 ctx->i32_0, "");
3816 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3817 result = ac_build_alu_op(ctx, result, tmp, op);
3818 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
3819 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3820 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
3821 ctx->i32_0, "");
3822 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3823 result = ac_build_alu_op(ctx, result, tmp, op);
3824 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
3825 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3826 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
3827 ctx->i32_0, "");
3828 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3829 result = ac_build_alu_op(ctx, result, tmp, op);
3830 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3831 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3832 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
3833 ctx->i32_0, "");
3834 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3835 result = ac_build_alu_op(ctx, result, tmp, op);
3836 tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
3837 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3838 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
3839 ctx->i32_0, "");
3840 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3841 result = ac_build_alu_op(ctx, result, tmp, op);
3842 return result;
3843 }
3844
3845 if (maxprefix <= 1)
3846 return result;
3847 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3848 result = ac_build_alu_op(ctx, result, tmp, op);
3849 if (maxprefix <= 2)
3850 return result;
3851 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
3852 result = ac_build_alu_op(ctx, result, tmp, op);
3853 if (maxprefix <= 3)
3854 return result;
3855 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
3856 result = ac_build_alu_op(ctx, result, tmp, op);
3857 if (maxprefix <= 4)
3858 return result;
3859 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
3860 result = ac_build_alu_op(ctx, result, tmp, op);
3861 if (maxprefix <= 8)
3862 return result;
3863 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
3864 result = ac_build_alu_op(ctx, result, tmp, op);
3865 if (maxprefix <= 16)
3866 return result;
3867
3868 if (ctx->chip_class >= GFX10) {
3869 LLVMValueRef tid = ac_get_thread_id(ctx);
3870 LLVMValueRef active;
3871
3872 tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
3873
3874 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3875 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
3876 ctx->i32_0, "");
3877
3878 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3879
3880 result = ac_build_alu_op(ctx, result, tmp, op);
3881
3882 if (maxprefix <= 32)
3883 return result;
3884
3885 tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3886
3887 active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");
3888
3889 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3890
3891 result = ac_build_alu_op(ctx, result, tmp, op);
3892 return result;
3893 }
3894
3895 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3896 result = ac_build_alu_op(ctx, result, tmp, op);
3897 if (maxprefix <= 32)
3898 return result;
3899 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3900 result = ac_build_alu_op(ctx, result, tmp, op);
3901 return result;
3902 }
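
/* A scalar model of the inclusive scan computed above (illustrative,
 * Hillis-Steele style with iadd as the example op; the DPP sequence uses
 * different step sizes but yields the same result[i] = op(src[0..i])).
 */
UNUSED static void inclusive_scan_scalar(uint32_t *lanes, unsigned num_lanes)
{
   for (unsigned shift = 1; shift < num_lanes; shift *= 2) {
      /* Walk downward so each read sees the previous round's value. */
      for (unsigned i = num_lanes; i-- > shift;)
         lanes[i] += lanes[i - shift];
   }
}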
3903
3904 LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3905 {
3906 LLVMValueRef result;
3907
3908 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3909 LLVMBuilderRef builder = ctx->builder;
3910 src = LLVMBuildZExt(builder, src, ctx->i32, "");
3911 result = ac_build_ballot(ctx, src);
3912 result = ac_build_mbcnt(ctx, result);
3913 result = LLVMBuildAdd(builder, result, src, "");
3914 return result;
3915 }
3916
3917 ac_build_optimization_barrier(ctx, &src);
3918
3919 LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3920 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3921 LLVMTypeOf(identity), "");
3922 result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
3923
3924 return ac_build_wwm(ctx, result);
3925 }
3926
3927 LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3928 {
3929 LLVMValueRef result;
3930
3931 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3932 LLVMBuilderRef builder = ctx->builder;
3933 src = LLVMBuildZExt(builder, src, ctx->i32, "");
3934 result = ac_build_ballot(ctx, src);
3935 result = ac_build_mbcnt(ctx, result);
3936 return result;
3937 }
3938
3939 ac_build_optimization_barrier(ctx, &src);
3940
3941 LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3942 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3943 LLVMTypeOf(identity), "");
3944 result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
3945
3946 return ac_build_wwm(ctx, result);
3947 }
3948
3949 LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
3950 unsigned cluster_size)
3951 {
3952 if (cluster_size == 1)
3953 return src;
3954 ac_build_optimization_barrier(ctx, &src);
3955 LLVMValueRef result, swap;
3956 LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3957 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3958 LLVMTypeOf(identity), "");
3959 swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
3960 result = ac_build_alu_op(ctx, result, swap, op);
3961 if (cluster_size == 2)
3962 return ac_build_wwm(ctx, result);
3963
3964 swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
3965 result = ac_build_alu_op(ctx, result, swap, op);
3966 if (cluster_size == 4)
3967 return ac_build_wwm(ctx, result);
3968
3969 if (ctx->chip_class >= GFX8)
3970 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
3971 else
3972 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
3973 result = ac_build_alu_op(ctx, result, swap, op);
3974 if (cluster_size == 8)
3975 return ac_build_wwm(ctx, result);
3976
3977 if (ctx->chip_class >= GFX8)
3978 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
3979 else
3980 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
3981 result = ac_build_alu_op(ctx, result, swap, op);
3982 if (cluster_size == 16)
3983 return ac_build_wwm(ctx, result);
3984
3985 if (ctx->chip_class >= GFX10)
3986 swap = ac_build_permlane16(ctx, result, 0, true, false);
3987 else if (ctx->chip_class >= GFX8 && cluster_size != 32)
3988 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3989 else
3990 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
3991 result = ac_build_alu_op(ctx, result, swap, op);
3992 if (cluster_size == 32)
3993 return ac_build_wwm(ctx, result);
3994
3995 if (ctx->chip_class >= GFX8) {
3996 if (ctx->wave_size == 64) {
3997 if (ctx->chip_class >= GFX10)
3998 swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3999 else
4000 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4001 result = ac_build_alu_op(ctx, result, swap, op);
4002 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
4003 }
4004
4005 return ac_build_wwm(ctx, result);
4006 } else {
4007 swap = ac_build_readlane(ctx, result, ctx->i32_0);
4008 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
4009 result = ac_build_alu_op(ctx, result, swap, op);
4010 return ac_build_wwm(ctx, result);
4011 }
4012 }
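
/* A scalar model of the cluster reduction above (illustrative, with iadd
 * as the example op): each round combines xor-partners, so every lane of a
 * cluster ends up holding the cluster's reduction.
 */
UNUSED static void cluster_reduce_scalar(uint32_t *lanes, unsigned num_lanes,
                                         unsigned cluster_size)
{
   uint32_t tmp[64];
   assert(num_lanes <= 64);
   for (unsigned s = 1; s < cluster_size; s *= 2) {
      for (unsigned i = 0; i < num_lanes; i++)
         tmp[i] = lanes[i] + lanes[i ^ s];
      memcpy(lanes, tmp, num_lanes * sizeof(*lanes));
   }
}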
4013
4014 /**
4015 * "Top half" of a scan that reduces per-wave values across an entire
4016 * workgroup.
4017 *
4018 * The source value must be present in the highest lane of the wave, and the
4019 * highest lane must be live.
4020 */
4021 void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4022 {
4023 if (ws->maxwaves <= 1)
4024 return;
4025
4026 const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
4027 LLVMBuilderRef builder = ctx->builder;
4028 LLVMValueRef tid = ac_get_thread_id(ctx);
4029 LLVMValueRef tmp;
4030
4031 tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
4032 ac_build_ifcc(ctx, tmp, 1000);
4033 LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4034 ac_build_endif(ctx, 1000);
4035 }
4036
4037 /**
4038 * "Bottom half" of a scan that reduces per-wave values across an entire
4039 * workgroup.
4040 *
4041 * The caller must place a barrier between the top and bottom halves.
4042 */
4043 void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4044 {
4045 const LLVMTypeRef type = LLVMTypeOf(ws->src);
4046 const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
4047
4048 if (ws->maxwaves <= 1) {
4049 ws->result_reduce = ws->src;
4050 ws->result_inclusive = ws->src;
4051 ws->result_exclusive = identity;
4052 return;
4053 }
4054 assert(ws->maxwaves <= 32);
4055
4056 LLVMBuilderRef builder = ctx->builder;
4057 LLVMValueRef tid = ac_get_thread_id(ctx);
4058 LLVMBasicBlockRef bbs[2];
4059 LLVMValueRef phivalues_scan[2];
4060 LLVMValueRef tmp, tmp2;
4061
4062 bbs[0] = LLVMGetInsertBlock(builder);
4063 phivalues_scan[0] = LLVMGetUndef(type);
4064
4065 if (ws->enable_reduce)
4066 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4067 else if (ws->enable_inclusive)
4068 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4069 else
4070 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4071 ac_build_ifcc(ctx, tmp, 1001);
4072 {
4073 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
4074
4075 ac_build_optimization_barrier(ctx, &tmp);
4076
4077 bbs[1] = LLVMGetInsertBlock(builder);
4078 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
4079 }
4080 ac_build_endif(ctx, 1001);
4081
4082 const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4083
4084 if (ws->enable_reduce) {
4085 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4086 ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4087 }
4088 if (ws->enable_inclusive)
4089 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4090 if (ws->enable_exclusive) {
4091 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4092 tmp = ac_build_readlane(ctx, scan, tmp);
4093 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4094 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4095 }
4096 }
4097
4098 /**
4099 * Inclusive scan of a per-wave value across an entire workgroup.
4100 *
4101 * This implies an s_barrier instruction.
4102 *
4103 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
4104 * of the workgroup are live. (This requirement cannot easily be relaxed in a
4105 * useful manner because of the barrier in the algorithm.)
4106 */
4107 void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4108 {
4109 ac_build_wg_wavescan_top(ctx, ws);
4110 ac_build_s_barrier(ctx);
4111 ac_build_wg_wavescan_bottom(ctx, ws);
4112 }
4113
4114 /**
4115 * "Top half" of a scan that reduces per-thread values across an entire
4116 * workgroup.
4117 *
4118 * All lanes must be active when this code runs.
4119 */
4120 void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4121 {
4122 if (ws->enable_exclusive) {
4123 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4124 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4125 ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4126 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4127 } else {
4128 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4129 }
4130
4131 bool enable_inclusive = ws->enable_inclusive;
4132 bool enable_exclusive = ws->enable_exclusive;
4133 ws->enable_inclusive = false;
4134 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4135 ac_build_wg_wavescan_top(ctx, ws);
4136 ws->enable_inclusive = enable_inclusive;
4137 ws->enable_exclusive = enable_exclusive;
4138 }
4139
4140 /**
4141 * "Bottom half" of a scan that reduces per-thread values across an entire
4142 * workgroup.
4143 *
4144 * The caller must place a barrier between the top and bottom halves.
4145 */
4146 void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4147 {
4148 bool enable_inclusive = ws->enable_inclusive;
4149 bool enable_exclusive = ws->enable_exclusive;
4150 ws->enable_inclusive = false;
4151 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4152 ac_build_wg_wavescan_bottom(ctx, ws);
4153 ws->enable_inclusive = enable_inclusive;
4154 ws->enable_exclusive = enable_exclusive;
4155
4156 /* ws->result_reduce is already the correct value */
4157 if (ws->enable_inclusive)
4158 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4159 if (ws->enable_exclusive)
4160 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4161 }
4162
4163 /**
4164 * A scan that reduces per-thread values across an entire workgroup.
4165 *
4166 * The caller must ensure that all lanes are active when this code runs
4167 * (WWM is insufficient!), because there is an implied barrier.
4168 */
4169 void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4170 {
4171 ac_build_wg_scan_top(ctx, ws);
4172 ac_build_s_barrier(ctx);
4173 ac_build_wg_scan_bottom(ctx, ws);
4174 }
4175
4176 LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
4177 unsigned lane1, unsigned lane2, unsigned lane3)
4178 {
4179 unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4180 if (ctx->chip_class >= GFX8) {
4181 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4182 } else {
4183 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4184 }
4185 }
4186
4187 LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4188 {
4189 LLVMTypeRef type = LLVMTypeOf(src);
4190 LLVMValueRef result;
4191
4192 index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4193 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
4194
4195 result =
4196 ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, (LLVMValueRef[]){index, src}, 2,
4197 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
4198 return LLVMBuildTrunc(ctx->builder, result, type, "");
4199 }
4200
4201 LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4202 {
4203 LLVMTypeRef type;
4204 char *intr;
4205
4206 if (bitsize == 16) {
4207 intr = "llvm.amdgcn.frexp.exp.i16.f16";
4208 type = ctx->i16;
4209 } else if (bitsize == 32) {
4210 intr = "llvm.amdgcn.frexp.exp.i32.f32";
4211 type = ctx->i32;
4212 } else {
4213 intr = "llvm.amdgcn.frexp.exp.i32.f64";
4214 type = ctx->i32;
4215 }
4216
4217 LLVMValueRef params[] = {
4218 src0,
4219 };
4220 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4221 }
4222 LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4223 {
4224 LLVMTypeRef type;
4225 char *intr;
4226
4227 if (bitsize == 16) {
4228 intr = "llvm.amdgcn.frexp.mant.f16";
4229 type = ctx->f16;
4230 } else if (bitsize == 32) {
4231 intr = "llvm.amdgcn.frexp.mant.f32";
4232 type = ctx->f32;
4233 } else {
4234 intr = "llvm.amdgcn.frexp.mant.f64";
4235 type = ctx->f64;
4236 }
4237
4238 LLVMValueRef params[] = {
4239 src0,
4240 };
4241 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4242 }
4243
4244 LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4245 {
4246 LLVMTypeRef type;
4247 char *intr;
4248
4249 if (bitsize == 16) {
4250 intr = "llvm.canonicalize.f16";
4251 type = ctx->f16;
4252 } else if (bitsize == 32) {
4253 intr = "llvm.canonicalize.f32";
4254 type = ctx->f32;
4255 } else {
4256 intr = "llvm.canonicalize.f64";
4257 type = ctx->f64;
4258 }
4259
4260 LLVMValueRef params[] = {
4261 src0,
4262 };
4263 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4264 }
4265
4266 /*
4267 * This takes an (I, J) coordinate pair
4268 * and works out the X and Y derivatives.
4269 * It returns DDX(I), DDX(J), DDY(I), DDY(J).
4270 */
4271 LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4272 {
4273 LLVMValueRef result[4], a;
4274 unsigned i;
4275
4276 for (i = 0; i < 2; i++) {
4277 a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), "");
4278 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4279 result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4280 }
4281 return ac_build_gather_values(ctx, result, 4);
4282 }
4283
4284 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4285 {
4286 LLVMValueRef result =
4287 ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4288 result = LLVMBuildNot(ctx->builder, result, "");
4289 return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
4290 }
4291
4292 LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
4293 {
4294 if (!ctx->postponed_kill)
4295 return ac_build_load_helper_invocation(ctx);
4296
4297 /* !(exact && postponed) */
4298 LLVMValueRef exact =
4299 ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4300
4301 LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
4302 LLVMValueRef result = LLVMBuildAnd(ctx->builder, exact, postponed, "");
4303
4304 return LLVMBuildSelect(ctx->builder, result, ctx->i32_0,
4305 LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), "");
4306 }
4307
4308 LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,
4309 unsigned num_args)
4310 {
4311 LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
4312 LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
4313 return ret;
4314 }
4315
4316 void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
4317 LLVMValueRef samplemask, struct ac_export_args *args)
4318 {
4319 unsigned mask = 0;
4320 unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL);
4321
4322 assert(depth || stencil || samplemask);
4323
4324 memset(args, 0, sizeof(*args));
4325
4326 args->valid_mask = 1; /* whether the EXEC mask is valid */
4327 args->done = 1; /* DONE bit */
4328
4329 /* Specify the target we are exporting */
4330 args->target = V_008DFC_SQ_EXP_MRTZ;
4331
4332 args->compr = 0; /* COMPR flag */
4333 args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
4334 args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
4335 args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
4336 args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */
4337
4338 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
4339 assert(!depth);
4340 args->compr = 1; /* COMPR flag */
4341
4342 if (stencil) {
4343 /* Stencil should be in X[23:16]. */
4344 stencil = ac_to_integer(ctx, stencil);
4345 stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
4346 args->out[0] = ac_to_float(ctx, stencil);
4347 mask |= 0x3;
4348 }
4349 if (samplemask) {
4350 /* SampleMask should be in Y[15:0]. */
4351 args->out[1] = samplemask;
4352 mask |= 0xc;
4353 }
4354 } else {
4355 if (depth) {
4356 args->out[0] = depth;
4357 mask |= 0x1;
4358 }
4359 if (stencil) {
4360 args->out[1] = stencil;
4361 mask |= 0x2;
4362 }
4363 if (samplemask) {
4364 args->out[2] = samplemask;
4365 mask |= 0x4;
4366 }
4367 }
4368
4369 /* GFX6 (except OLAND and HAINAN) has a bug where it only looks
4370 * at the X writemask component. */
4371 if (ctx->chip_class == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN)
4372 mask |= 0x1;
4373
4374 /* Specify which components to enable */
4375 args->enabled_channels = mask;
4376 }
4377
4378 /* Send GS Alloc Req message from the first wave of the group to SPI.
4379 * Message payload is:
4380 * - bits 0..10: vertices in group
4381 * - bits 12..22: primitives in group
4382 */
4383 void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
4384 LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
4385 {
4386 LLVMBuilderRef builder = ctx->builder;
4387 LLVMValueRef tmp;
4388 bool export_dummy_prim = false;
4389
4390 /* HW workaround for a GPU hang with 100% culling.
4391 * We always have to export at least 1 primitive.
4392 * Export a degenerate triangle using vertex 0 for all 3 vertices.
4393 */
4394 if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) {
4395 assert(vtx_cnt == ctx->i32_0);
4396 prim_cnt = ctx->i32_1;
4397 vtx_cnt = ctx->i32_1;
4398 export_dummy_prim = true;
4399 }
4400
4401 ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);
4402
4403 tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
4404 tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
4405 ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);
4406
4407 if (export_dummy_prim) {
4408 struct ac_ngg_prim prim = {};
4409 /* The vertex indices are 0,0,0. */
4410 prim.passthrough = ctx->i32_0;
4411
4412 struct ac_export_args pos = {};
4413 pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0;
4414 pos.target = V_008DFC_SQ_EXP_POS;
4415 pos.enabled_channels = 0xf;
4416 pos.done = true;
4417
4418 ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),
4419 5021);
4420 ac_build_export_prim(ctx, &prim);
4421 ac_build_export(ctx, &pos);
4422 ac_build_endif(ctx, 5021);
4423 }
4424
4425 ac_build_endif(ctx, 5020);
4426 }
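
/* The message payload packing above, as plain arithmetic (illustrative
 * helper, not driver code):
 */
UNUSED static uint32_t gs_alloc_req_payload(uint32_t vtx_cnt, uint32_t prim_cnt)
{
   return (prim_cnt << 12) | vtx_cnt; /* prims in bits 12..22, verts in 0..10 */
}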
4427
4428 LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4429 {
4430 /* The prim export format is:
4431 * - bits 0..8: index 0
4432 * - bit 9: edge flag 0
4433 * - bits 10..18: index 1
4434 * - bit 19: edge flag 1
4435 * - bits 20..28: index 2
4436 * - bit 29: edge flag 2
4437 * - bit 31: null primitive (skip)
4438 */
4439 LLVMBuilderRef builder = ctx->builder;
4440 LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
4441 LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
4442
4443 for (unsigned i = 0; i < prim->num_vertices; ++i) {
4444 tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");
4445 result = LLVMBuildOr(builder, result, tmp, "");
4446 tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, "");
4447 tmp = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 10 * i + 9, false), "");
4448 result = LLVMBuildOr(builder, result, tmp, "");
4449 }
4450 return result;
4451 }
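
/* A scalar model of the packing above for a triangle (illustrative, not
 * driver code):
 */
UNUSED static uint32_t pack_prim_export_scalar(bool isnull, const unsigned index[3],
                                               const bool edgeflag[3])
{
   uint32_t result = (uint32_t)isnull << 31;
   for (unsigned i = 0; i < 3; i++) {
      result |= (index[i] & 0x1ff) << (10 * i);
      result |= (uint32_t)edgeflag[i] << (10 * i + 9);
   }
   return result;
}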
4452
4453 void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4454 {
4455 struct ac_export_args args;
4456
4457 if (prim->passthrough) {
4458 args.out[0] = prim->passthrough;
4459 } else {
4460 args.out[0] = ac_pack_prim_export(ctx, prim);
4461 }
4462
4463 args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");
4464 args.out[1] = LLVMGetUndef(ctx->f32);
4465 args.out[2] = LLVMGetUndef(ctx->f32);
4466 args.out[3] = LLVMGetUndef(ctx->f32);
4467
4468 args.target = V_008DFC_SQ_EXP_PRIM;
4469 args.enabled_channels = 1;
4470 args.done = true;
4471 args.valid_mask = false;
4472 args.compr = false;
4473
4474 ac_build_export(ctx, &args);
4475 }
4476
4477 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
4478 {
4479 if (type == AC_ARG_FLOAT) {
4480 return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
4481 } else if (type == AC_ARG_INT) {
4482 return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
4483 } else {
4484 LLVMTypeRef ptr_type;
4485 switch (type) {
4486 case AC_ARG_CONST_PTR:
4487 ptr_type = ctx->i8;
4488 break;
4489 case AC_ARG_CONST_FLOAT_PTR:
4490 ptr_type = ctx->f32;
4491 break;
4492 case AC_ARG_CONST_PTR_PTR:
4493 ptr_type = ac_array_in_const32_addr_space(ctx->i8);
4494 break;
4495 case AC_ARG_CONST_DESC_PTR:
4496 ptr_type = ctx->v4i32;
4497 break;
4498 case AC_ARG_CONST_IMAGE_PTR:
4499 ptr_type = ctx->v8i32;
4500 break;
4501 default:
4502 unreachable("unknown arg type");
4503 }
4504 if (size == 1) {
4505 return ac_array_in_const32_addr_space(ptr_type);
4506 } else {
4507 assert(size == 2);
4508 return ac_array_in_const_addr_space(ptr_type);
4509 }
4510 }
4511 }
4512
4513 LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
4514 enum ac_llvm_calling_convention convention, const char *name,
4515 LLVMTypeRef ret_type, LLVMModuleRef module)
4516 {
4517 LLVMTypeRef arg_types[AC_MAX_ARGS];
4518
4519 for (unsigned i = 0; i < args->arg_count; i++) {
4520 arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
4521 }
4522
4523 LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0);
4524
4525 LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
4526 LLVMBasicBlockRef main_function_body =
4527 LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
4528 LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);
4529
4530 LLVMSetFunctionCallConv(main_function, convention);
4531 for (unsigned i = 0; i < args->arg_count; ++i) {
4532 LLVMValueRef P = LLVMGetParam(main_function, i);
4533
4534 if (args->args[i].file != AC_ARG_SGPR)
4535 continue;
4536
4537 ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG);
4538
4539 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4540 ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
4541 ac_add_attr_dereferenceable(P, UINT64_MAX);
4542 ac_add_attr_alignment(P, 32);
4543 }
4544 }
4545
4546 ctx->main_function = main_function;
4547
4548 if (LLVM_VERSION_MAJOR >= 11) {
4549 /* Enable denormals for FP16 and FP64: */
4550 LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
4551 /* Disable denormals for FP32: */
4552 LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
4553 "preserve-sign,preserve-sign");
4554 }
4555 return main_function;
4556 }
4557
4558 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
4559 {
4560 LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
4561 LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
4562 LLVMBuildCall(ctx->builder, code, NULL, 0, "");
4563 }
4564
4565 LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef index)
4566 {
4567 LLVMBuilderRef builder = ctx->builder;
4568 LLVMTypeRef type = LLVMTypeOf(mask);
4569
4570 LLVMValueRef bit =
4571 LLVMBuildShl(builder, LLVMConstInt(type, 1, 0), LLVMBuildZExt(builder, index, type, ""), "");
4572 LLVMValueRef prefix_bits = LLVMBuildSub(builder, bit, LLVMConstInt(type, 1, 0), "");
4573 LLVMValueRef prefix_mask = LLVMBuildAnd(builder, mask, prefix_bits, "");
4574 return ac_build_bit_count(ctx, prefix_mask);
4575 }
4576
4577 /* Compute the prefix sum of the "mask" bit array with 128 elements (bits). */
4578 LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, LLVMValueRef mask[2],
4579 LLVMValueRef index)
4580 {
4581 LLVMBuilderRef builder = ctx->builder;
4582 #if 0
4583 /* Reference version using i128. */
4584 LLVMValueRef input_mask =
4585 LLVMBuildBitCast(builder, ac_build_gather_values(ctx, mask, 2), ctx->i128, "");
4586
4587 return ac_prefix_bitcount(ctx, input_mask, index);
4588 #else
4589 /* Optimized version using 2 64-bit masks. */
4590 LLVMValueRef is_hi, is_0, c64, c128, all_bits;
4591 LLVMValueRef prefix_mask[2], shift[2], mask_bcnt0, prefix_bcnt[2];
4592
4593 /* Compute the 128-bit prefix mask. */
4594 c64 = LLVMConstInt(ctx->i32, 64, 0);
4595 c128 = LLVMConstInt(ctx->i32, 128, 0);
4596 all_bits = LLVMConstInt(ctx->i64, UINT64_MAX, 0);
4597 /* The first index that can have non-zero high bits in the prefix mask is 65. */
4598 is_hi = LLVMBuildICmp(builder, LLVMIntUGT, index, c64, "");
4599 is_0 = LLVMBuildICmp(builder, LLVMIntEQ, index, ctx->i32_0, "");
4600 mask_bcnt0 = ac_build_bit_count(ctx, mask[0]);
4601
4602 for (unsigned i = 0; i < 2; i++) {
4603 shift[i] = LLVMBuildSub(builder, i ? c128 : c64, index, "");
4604 /* For i==0, index==0, the right shift by 64 doesn't give the desired result,
4605 * so we handle it by the is_0 select.
4606 * For i==1, index==64, same story, so we handle it by the last is_hi select.
4607 * For i==0, index==64, we shift by 0, which is what we want.
4608 */
4609 prefix_mask[i] =
4610 LLVMBuildLShr(builder, all_bits, LLVMBuildZExt(builder, shift[i], ctx->i64, ""), "");
4611 prefix_mask[i] = LLVMBuildAnd(builder, mask[i], prefix_mask[i], "");
4612 prefix_bcnt[i] = ac_build_bit_count(ctx, prefix_mask[i]);
4613 }
4614
4615 prefix_bcnt[0] = LLVMBuildSelect(builder, is_0, ctx->i32_0, prefix_bcnt[0], "");
4616 prefix_bcnt[0] = LLVMBuildSelect(builder, is_hi, mask_bcnt0, prefix_bcnt[0], "");
4617 prefix_bcnt[1] = LLVMBuildSelect(builder, is_hi, prefix_bcnt[1], ctx->i32_0, "");
4618
4619 return LLVMBuildAdd(builder, prefix_bcnt[0], prefix_bcnt[1], "");
4620 #endif
4621 }
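
/* A scalar reference of the same computation (illustrative, not driver
 * code): count the set bits of the 128-bit mask strictly below 'index'.
 */
UNUSED static unsigned prefix_bitcount_2x64_scalar(uint64_t mask_lo, uint64_t mask_hi,
                                                   unsigned index)
{
   if (index == 0)
      return 0;
   if (index <= 64)
      return util_bitcount64(mask_lo & (UINT64_MAX >> (64 - index)));
   return util_bitcount64(mask_lo) +
          util_bitcount64(mask_hi & (UINT64_MAX >> (128 - index)));
}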
4622
4623 /**
4624 * Convert triangle strip indices to triangle indices. This is used to decompose
4625 * triangle strips into triangles.
4626 */
4627 void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd,
4628 LLVMValueRef flatshade_first,
4629 LLVMValueRef index[3])
4630 {
4631 LLVMBuilderRef builder = ctx->builder;
4632 LLVMValueRef out[3];
4633
4634 /* We need to change the vertex order for odd triangles to get correct
4635 * front/back facing by swapping 2 vertex indices, but we also have to
4636 * keep the provoking vertex in the same place.
4637 *
4638 * If the first vertex is provoking, swap index 1 and 2.
4639 * If the last vertex is provoking, swap index 0 and 1.
4640 */
4641 out[0] = LLVMBuildSelect(builder, flatshade_first, index[0],
4642 LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), "");
4643 out[1] = LLVMBuildSelect(builder, flatshade_first,
4644 LLVMBuildSelect(builder, is_odd, index[2], index[1], ""),
4645 LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), "");
4646 out[2] = LLVMBuildSelect(builder, flatshade_first,
4647 LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], "");
4648 memcpy(index, out, sizeof(out));
4649 }
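
/* A scalar model of the swap rules above (illustrative, not driver code):
 */
UNUSED static void strip_indices_to_triangle_scalar(bool is_odd, bool flatshade_first,
                                                    unsigned index[3])
{
   if (!is_odd)
      return;
   if (flatshade_first) {
      unsigned tmp = index[1]; /* first vertex provoking: swap 1 and 2 */
      index[1] = index[2];
      index[2] = tmp;
   } else {
      unsigned tmp = index[0]; /* last vertex provoking: swap 0 and 1 */
      index[0] = index[1];
      index[1] = tmp;
   }
}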