src/amd/common/ac_llvm_build.c

   1 /*
   2  * Copyright 2014 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sub license, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  19  *
  20  * The above copyright notice and this permission notice (including the
  21  * next paragraph) shall be included in all copies or substantial portions
  22  * of the Software.
  23  *
  24  */
  25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
  26 #include "ac_llvm_build.h"
  27
  28 #include <llvm-c/Core.h>
  29
  30 #include "c11/threads.h"
  31
  32 #include <assert.h>
  33 #include <stdio.h>
  34
  35 #include "ac_llvm_util.h"
  36 #include "ac_exp_param.h"
  37 #include "util/bitscan.h"
  38 #include "util/macros.h"
  39 #include "util/u_atomic.h"
  40 #include "util/u_math.h"
  41 #include "sid.h"
  42
  43 #include "shader_enums.h"
  44
/* NOTE(review): presumably the initial capacity of the control-flow stack
 * (ctx->flow); the use of this constant is outside this chunk - confirm.
 */
#define AC_LLVM_INITIAL_CF_DEPTH 4
  46
/* Data for if/else/endif and bgnloop/endloop control flow structures.
 * One instance describes a single open control-flow construct.
 */
struct ac_llvm_flow {
        /* Loop exit or next part of if/else/endif. */
        LLVMBasicBlockRef next_block;
        /* Entry block of a loop.
         * NOTE(review): presumably unused for if/else constructs - confirm
         * against the code that pushes flow entries.
         */
        LLVMBasicBlockRef loop_entry_block;
};
  54
  55 /* Initialize module-independent parts of the context.
  56  *
  57  * The caller is responsible for initializing ctx::module and ctx::builder.
  58  */
  59 void
  60 ac_llvm_context_init(struct ac_llvm_context *ctx,
  61                      struct ac_llvm_compiler *compiler,
  62                      enum chip_class chip_class, enum radeon_family family,
  63                      unsigned wave_size)
  64 {
  65         LLVMValueRef args[1];
  66
  67         ctx->context = LLVMContextCreate();
  68
  69         ctx->chip_class = chip_class;
  70         ctx->family = family;
  71         ctx->wave_size = wave_size;
  72         ctx->module = ac_create_module(wave_size == 32 ? compiler->tm_wave32
  73                                                        : compiler->tm,
  74                                        ctx->context);
  75         ctx->builder = NULL;
  76
  77         ctx->voidt = LLVMVoidTypeInContext(ctx->context);
  78         ctx->i1 = LLVMInt1TypeInContext(ctx->context);
  79         ctx->i8 = LLVMInt8TypeInContext(ctx->context);
  80         ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
  81         ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
  82         ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
  83         ctx->intptr = ctx->i32;
  84         ctx->f16 = LLVMHalfTypeInContext(ctx->context);
  85         ctx->f32 = LLVMFloatTypeInContext(ctx->context);
  86         ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
  87         ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
  88         ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
  89         ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
  90         ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
  91         ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
  92         ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
  93         ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
  94         ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
  95
  96         ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
  97         ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
  98         ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
  99         ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
 100         ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
 101         ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
 102         ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
 103         ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
 104         ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
 105         ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
 106         ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
 107         ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
 108         ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
 109         ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
 110
 111         ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
 112         ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
 113
 114         ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 115                                                      "range", 5);
 116
 117         ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 118                                                                "invariant.load", 14);
 119
 120         ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
 121
 122         args[0] = LLVMConstReal(ctx->f32, 2.5);
 123         ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
 124
 125         ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 126                                                         "amdgpu.uniform", 14);
 127
 128         ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
 129 }
 130
 131 void
 132 ac_llvm_context_dispose(struct ac_llvm_context *ctx)
 133 {
 134         free(ctx->flow);
 135         ctx->flow = NULL;
 136         ctx->flow_depth_max = 0;
 137 }
 138
 139 int
 140 ac_get_llvm_num_components(LLVMValueRef value)
 141 {
 142         LLVMTypeRef type = LLVMTypeOf(value);
 143         unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
 144                                       ? LLVMGetVectorSize(type)
 145                                       : 1;
 146         return num_components;
 147 }
 148
 149 LLVMValueRef
 150 ac_llvm_extract_elem(struct ac_llvm_context *ac,
 151                      LLVMValueRef value,
 152                      int index)
 153 {
 154         if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
 155                 assert(index == 0);
 156                 return value;
 157         }
 158
 159         return LLVMBuildExtractElement(ac->builder, value,
 160                                        LLVMConstInt(ac->i32, index, false), "");
 161 }
 162
 163 int
 164 ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
 165 {
 166         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
 167                 type = LLVMGetElementType(type);
 168
 169         if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
 170                 return LLVMGetIntTypeWidth(type);
 171
 172         if (type == ctx->f16)
 173                 return 16;
 174         if (type == ctx->f32)
 175                 return 32;
 176         if (type == ctx->f64)
 177                 return 64;
 178
 179         unreachable("Unhandled type kind in get_elem_bits");
 180 }
 181
 182 unsigned
 183 ac_get_type_size(LLVMTypeRef type)
 184 {
 185         LLVMTypeKind kind = LLVMGetTypeKind(type);
 186
 187         switch (kind) {
 188         case LLVMIntegerTypeKind:
 189                 return LLVMGetIntTypeWidth(type) / 8;
 190         case LLVMHalfTypeKind:
 191                 return 2;
 192         case LLVMFloatTypeKind:
 193                 return 4;
 194         case LLVMDoubleTypeKind:
 195                 return 8;
 196         case LLVMPointerTypeKind:
 197                 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
 198                         return 4;
 199                 return 8;
 200         case LLVMVectorTypeKind:
 201                 return LLVMGetVectorSize(type) *
 202                        ac_get_type_size(LLVMGetElementType(type));
 203         case LLVMArrayTypeKind:
 204                 return LLVMGetArrayLength(type) *
 205                        ac_get_type_size(LLVMGetElementType(type));
 206         default:
 207                 assert(0);
 208                 return 0;
 209         }
 210 }
 211
 212 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 213 {
 214         if (t == ctx->i8)
 215                 return ctx->i8;
 216         else if (t == ctx->f16 || t == ctx->i16)
 217                 return ctx->i16;
 218         else if (t == ctx->f32 || t == ctx->i32)
 219                 return ctx->i32;
 220         else if (t == ctx->f64 || t == ctx->i64)
 221                 return ctx->i64;
 222         else
 223                 unreachable("Unhandled integer size");
 224 }
 225
 226 LLVMTypeRef
 227 ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 228 {
 229         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 230                 LLVMTypeRef elem_type = LLVMGetElementType(t);
 231                 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
 232                                       LLVMGetVectorSize(t));
 233         }
 234         if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
 235                 switch (LLVMGetPointerAddressSpace(t)) {
 236                 case AC_ADDR_SPACE_GLOBAL:
 237                         return ctx->i64;
 238                 case AC_ADDR_SPACE_LDS:
 239                         return ctx->i32;
 240                 default:
 241                         unreachable("unhandled address space");
 242                 }
 243         }
 244         return to_integer_type_scalar(ctx, t);
 245 }
 246
 247 LLVMValueRef
 248 ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
 249 {
 250         LLVMTypeRef type = LLVMTypeOf(v);
 251         if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
 252                 return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 253         }
 254         return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 255 }
 256
 257 LLVMValueRef
 258 ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
 259 {
 260         LLVMTypeRef type = LLVMTypeOf(v);
 261         if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
 262                 return v;
 263         return ac_to_integer(ctx, v);
 264 }
 265
 266 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 267 {
 268         if (t == ctx->i8)
 269                 return ctx->i8;
 270         else if (t == ctx->i16 || t == ctx->f16)
 271                 return ctx->f16;
 272         else if (t == ctx->i32 || t == ctx->f32)
 273                 return ctx->f32;
 274         else if (t == ctx->i64 || t == ctx->f64)
 275                 return ctx->f64;
 276         else
 277                 unreachable("Unhandled float size");
 278 }
 279
 280 LLVMTypeRef
 281 ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 282 {
 283         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 284                 LLVMTypeRef elem_type = LLVMGetElementType(t);
 285                 return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
 286                                       LLVMGetVectorSize(t));
 287         }
 288         return to_float_type_scalar(ctx, t);
 289 }
 290
 291 LLVMValueRef
 292 ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
 293 {
 294         LLVMTypeRef type = LLVMTypeOf(v);
 295         return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
 296 }
 297
 298
 299 LLVMValueRef
 300 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
 301                    LLVMTypeRef return_type, LLVMValueRef *params,
 302                    unsigned param_count, unsigned attrib_mask)
 303 {
 304         LLVMValueRef function, call;
 305         bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
 306
 307         function = LLVMGetNamedFunction(ctx->module, name);
 308         if (!function) {
 309                 LLVMTypeRef param_types[32], function_type;
 310                 unsigned i;
 311
 312                 assert(param_count <= 32);
 313
 314                 for (i = 0; i < param_count; ++i) {
 315                         assert(params[i]);
 316                         param_types[i] = LLVMTypeOf(params[i]);
 317                 }
 318                 function_type =
 319                     LLVMFunctionType(return_type, param_types, param_count, 0);
 320                 function = LLVMAddFunction(ctx->module, name, function_type);
 321
 322                 LLVMSetFunctionCallConv(function, LLVMCCallConv);
 323                 LLVMSetLinkage(function, LLVMExternalLinkage);
 324
 325                 if (!set_callsite_attrs)
 326                         ac_add_func_attributes(ctx->context, function, attrib_mask);
 327         }
 328
 329         call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
 330         if (set_callsite_attrs)
 331                 ac_add_func_attributes(ctx->context, call, attrib_mask);
 332         return call;
 333 }
 334
 335 /**
 336  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 337  * intrinsic names).
 338  */
 339 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
 340 {
 341         LLVMTypeRef elem_type = type;
 342
 343         assert(bufsize >= 8);
 344
 345         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
 346                 int ret = snprintf(buf, bufsize, "v%u",
 347                                         LLVMGetVectorSize(type));
 348                 if (ret < 0) {
 349                         char *type_name = LLVMPrintTypeToString(type);
 350                         fprintf(stderr, "Error building type name for: %s\n",
 351                                 type_name);
 352                         return;
 353                 }
 354                 elem_type = LLVMGetElementType(type);
 355                 buf += ret;
 356                 bufsize -= ret;
 357         }
 358         switch (LLVMGetTypeKind(elem_type)) {
 359         default: break;
 360         case LLVMIntegerTypeKind:
 361                 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
 362                 break;
 363         case LLVMHalfTypeKind:
 364                 snprintf(buf, bufsize, "f16");
 365                 break;
 366         case LLVMFloatTypeKind:
 367                 snprintf(buf, bufsize, "f32");
 368                 break;
 369         case LLVMDoubleTypeKind:
 370                 snprintf(buf, bufsize, "f64");
 371                 break;
 372         }
 373 }
 374
 375 /**
 376  * Helper function that builds an LLVM IR PHI node and immediately adds
 377  * incoming edges.
 378  */
 379 LLVMValueRef
 380 ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
 381              unsigned count_incoming, LLVMValueRef *values,
 382              LLVMBasicBlockRef *blocks)
 383 {
 384         LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
 385         LLVMAddIncoming(phi, values, blocks, count_incoming);
 386         return phi;
 387 }
 388
 389 void ac_build_s_barrier(struct ac_llvm_context *ctx)
 390 {
 391         ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
 392                            0, AC_FUNC_ATTR_CONVERGENT);
 393 }
 394
 395 /* Prevent optimizations (at least of memory accesses) across the current
 396  * point in the program by emitting empty inline assembly that is marked as
 397  * having side effects.
 398  *
 399  * Optionally, a value can be passed through the inline assembly to prevent
 400  * LLVM from hoisting calls to ReadNone functions.
 401  */
 402 void
 403 ac_build_optimization_barrier(struct ac_llvm_context *ctx,
 404                               LLVMValueRef *pvgpr)
 405 {
 406         static int counter = 0;
 407
 408         LLVMBuilderRef builder = ctx->builder;
 409         char code[16];
 410
 411         snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
 412
 413         if (!pvgpr) {
 414                 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
 415                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
 416                 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
 417         } else {
 418                 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
 419                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
 420                 LLVMValueRef vgpr = *pvgpr;
 421                 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
 422                 unsigned vgpr_size = ac_get_type_size(vgpr_type);
 423                 LLVMValueRef vgpr0;
 424
 425                 assert(vgpr_size % 4 == 0);
 426
 427                 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
 428                 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
 429                 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
 430                 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
 431                 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
 432
 433                 *pvgpr = vgpr;
 434         }
 435 }
 436
 437 LLVMValueRef
 438 ac_build_shader_clock(struct ac_llvm_context *ctx)
 439 {
 440         const char *intr = HAVE_LLVM >= 0x0900 && ctx->chip_class >= GFX8 ?
 441                                 "llvm.amdgcn.s.memrealtime" : "llvm.readcyclecounter";
 442         LLVMValueRef tmp = ac_build_intrinsic(ctx, intr, ctx->i64, NULL, 0, 0);
 443         return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
 444 }
 445
 446 LLVMValueRef
 447 ac_build_ballot(struct ac_llvm_context *ctx,
 448                 LLVMValueRef value)
 449 {
 450         const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i32" : "llvm.amdgcn.icmp.i32";
 451         LLVMValueRef args[3] = {
 452                 value,
 453                 ctx->i32_0,
 454                 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
 455         };
 456
 457         /* We currently have no other way to prevent LLVM from lifting the icmp
 458          * calls to a dominating basic block.
 459          */
 460         ac_build_optimization_barrier(ctx, &args[0]);
 461
 462         args[0] = ac_to_integer(ctx, args[0]);
 463
 464         return ac_build_intrinsic(ctx, name,
 465                                   ctx->i64, args, 3,
 466                                   AC_FUNC_ATTR_NOUNWIND |
 467                                   AC_FUNC_ATTR_READNONE |
 468                                   AC_FUNC_ATTR_CONVERGENT);
 469 }
 470
 471 LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
 472                                  LLVMValueRef value)
 473 {
 474         const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i1";
 475         LLVMValueRef args[3] = {
 476                 value,
 477                 ctx->i1false,
 478                 LLVMConstInt(ctx->i32, LLVMIntNE, 0),
 479         };
 480
 481         assert(HAVE_LLVM >= 0x0800);
 482         return ac_build_intrinsic(ctx, name, ctx->i64, args, 3,
 483                                   AC_FUNC_ATTR_NOUNWIND |
 484                                   AC_FUNC_ATTR_READNONE |
 485                                   AC_FUNC_ATTR_CONVERGENT);
 486 }
 487
 488 LLVMValueRef
 489 ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
 490 {
 491         LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
 492         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 493         return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
 494 }
 495
 496 LLVMValueRef
 497 ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
 498 {
 499         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 500         return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
 501                              LLVMConstInt(ctx->i64, 0, 0), "");
 502 }
 503
 504 LLVMValueRef
 505 ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
 506 {
 507         LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
 508         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 509
 510         LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 511                                          vote_set, active_set, "");
 512         LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 513                                           vote_set,
 514                                           LLVMConstInt(ctx->i64, 0, 0), "");
 515         return LLVMBuildOr(ctx->builder, all, none, "");
 516 }
 517
 518 LLVMValueRef
 519 ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
 520                                unsigned value_count, unsigned component)
 521 {
 522         LLVMValueRef vec = NULL;
 523
 524         if (value_count == 1) {
 525                 return values[component];
 526         } else if (!value_count)
 527                 unreachable("value_count is 0");
 528
 529         for (unsigned i = component; i < value_count + component; i++) {
 530                 LLVMValueRef value = values[i];
 531
 532                 if (i == component)
 533                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 534                 LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
 535                 vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
 536         }
 537         return vec;
 538 }
 539
 540 LLVMValueRef
 541 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
 542                                 LLVMValueRef *values,
 543                                 unsigned value_count,
 544                                 unsigned value_stride,
 545                                 bool load,
 546                                 bool always_vector)
 547 {
 548         LLVMBuilderRef builder = ctx->builder;
 549         LLVMValueRef vec = NULL;
 550         unsigned i;
 551
 552         if (value_count == 1 && !always_vector) {
 553                 if (load)
 554                         return LLVMBuildLoad(builder, values[0], "");
 555                 return values[0];
 556         } else if (!value_count)
 557                 unreachable("value_count is 0");
 558
 559         for (i = 0; i < value_count; i++) {
 560                 LLVMValueRef value = values[i * value_stride];
 561                 if (load)
 562                         value = LLVMBuildLoad(builder, value, "");
 563
 564                 if (!i)
 565                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 566                 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
 567                 vec = LLVMBuildInsertElement(builder, vec, value, index, "");
 568         }
 569         return vec;
 570 }
 571
 572 LLVMValueRef
 573 ac_build_gather_values(struct ac_llvm_context *ctx,
 574                        LLVMValueRef *values,
 575                        unsigned value_count)
 576 {
 577         return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
 578 }
 579
 580 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 581  * channels with undef. Extract at most src_channels components from the input.
 582  */
 583 static LLVMValueRef
 584 ac_build_expand(struct ac_llvm_context *ctx,
 585                 LLVMValueRef value,
 586                 unsigned src_channels,
 587                 unsigned dst_channels)
 588 {
 589         LLVMTypeRef elemtype;
 590         LLVMValueRef chan[dst_channels];
 591
 592         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
 593                 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
 594
 595                 if (src_channels == dst_channels && vec_size == dst_channels)
 596                         return value;
 597
 598                 src_channels = MIN2(src_channels, vec_size);
 599
 600                 for (unsigned i = 0; i < src_channels; i++)
 601                         chan[i] = ac_llvm_extract_elem(ctx, value, i);
 602
 603                 elemtype = LLVMGetElementType(LLVMTypeOf(value));
 604         } else {
 605                 if (src_channels) {
 606                         assert(src_channels == 1);
 607                         chan[0] = value;
 608                 }
 609                 elemtype = LLVMTypeOf(value);
 610         }
 611
 612         for (unsigned i = src_channels; i < dst_channels; i++)
 613                 chan[i] = LLVMGetUndef(elemtype);
 614
 615         return ac_build_gather_values(ctx, chan, dst_channels);
 616 }
 617
 618 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
 619  * with undef. Extract at most num_channels components from the input.
 620  */
 621 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
 622                                      LLVMValueRef value,
 623                                      unsigned num_channels)
 624 {
 625         return ac_build_expand(ctx, value, num_channels, 4);
 626 }
 627
 628 LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
 629 {
 630         unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
 631         const char *name;
 632
 633         if (type_size == 2)
 634                 name = "llvm.rint.f16";
 635         else if (type_size == 4)
 636                 name = "llvm.rint.f32";
 637         else
 638                 name = "llvm.rint.f64";
 639
 640         return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
 641                                   AC_FUNC_ATTR_READNONE);
 642 }
 643
 644 LLVMValueRef
 645 ac_build_fdiv(struct ac_llvm_context *ctx,
 646               LLVMValueRef num,
 647               LLVMValueRef den)
 648 {
 649         /* If we do (num / den), LLVM >= 7.0 does:
 650          *    return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
 651          *
 652          * If we do (num * (1 / den)), LLVM does:
 653          *    return num * v_rcp_f32(den);
 654          */
 655         LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
 656         LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
 657         LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
 658
 659         /* Use v_rcp_f32 instead of precise division. */
 660         if (!LLVMIsConstant(ret))
 661                 LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
 662         return ret;
 663 }
 664
 665 /* See fast_idiv_by_const.h. */
 666 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
 667 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
 668                                 LLVMValueRef num,
 669                                 LLVMValueRef multiplier,
 670                                 LLVMValueRef pre_shift,
 671                                 LLVMValueRef post_shift,
 672                                 LLVMValueRef increment)
 673 {
 674         LLVMBuilderRef builder = ctx->builder;
 675
 676         num = LLVMBuildLShr(builder, num, pre_shift, "");
 677         num = LLVMBuildMul(builder,
 678                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 679                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 680         num = LLVMBuildAdd(builder, num,
 681                            LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
 682         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 683         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 684         return LLVMBuildLShr(builder, num, post_shift, "");
 685 }
 686
 687 /* See fast_idiv_by_const.h. */
 688 /* If num != UINT_MAX, this more efficient version can be used. */
 689 /* Set: increment = util_fast_udiv_info::increment; */
 690 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
 691                                     LLVMValueRef num,
 692                                     LLVMValueRef multiplier,
 693                                     LLVMValueRef pre_shift,
 694                                     LLVMValueRef post_shift,
 695                                     LLVMValueRef increment)
 696 {
 697         LLVMBuilderRef builder = ctx->builder;
 698
 699         num = LLVMBuildLShr(builder, num, pre_shift, "");
 700         num = LLVMBuildNUWAdd(builder, num, increment, "");
 701         num = LLVMBuildMul(builder,
 702                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 703                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 704         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 705         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 706         return LLVMBuildLShr(builder, num, post_shift, "");
 707 }
 708
 709 /* See fast_idiv_by_const.h. */
 710 /* Both operands must fit in 31 bits and the divisor must not be 1. */
 711 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
 712                                               LLVMValueRef num,
 713                                               LLVMValueRef multiplier,
 714                                               LLVMValueRef post_shift)
 715 {
 716         LLVMBuilderRef builder = ctx->builder;
 717
 718         num = LLVMBuildMul(builder,
 719                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 720                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 721         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 722         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 723         return LLVMBuildLShr(builder, num, post_shift, "");
 724 }
 725
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
        /* Face coordinates: stc[0] is sc, stc[1] is tc. */
        LLVMValueRef stc[2];
        /* Major axis value (already multiplied by two). */
        LLVMValueRef ma;
        /* Cube face number, produced as a float by llvm.amdgcn.cubeid. */
        LLVMValueRef id;
};
 735
 736 static void
 737 build_cube_intrinsic(struct ac_llvm_context *ctx,
 738                      LLVMValueRef in[3],
 739                      struct cube_selection_coords *out)
 740 {
 741         LLVMTypeRef f32 = ctx->f32;
 742
 743         out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
 744                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 745         out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
 746                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 747         out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
 748                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 749         out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
 750                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 751 }
 752
 753 /**
 754  * Build a manual selection sequence for cube face sc/tc coordinates and
 755  * major axis vector (multiplied by 2 for consistency) for the given
 756  * vec3 \p coords, for the face implied by \p selcoords.
 757  *
 758  * For the major axis, we always adjust the sign to be in the direction of
 759  * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 760  * the selcoords major axis.
 761  */
 762 static void build_cube_select(struct ac_llvm_context *ctx,
 763                               const struct cube_selection_coords *selcoords,
 764                               const LLVMValueRef *coords,
 765                               LLVMValueRef *out_st,
 766                               LLVMValueRef *out_ma)
 767 {
 768         LLVMBuilderRef builder = ctx->builder;
 769         LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
 770         LLVMValueRef is_ma_positive;
 771         LLVMValueRef sgn_ma;
 772         LLVMValueRef is_ma_z, is_not_ma_z;
 773         LLVMValueRef is_ma_y;
 774         LLVMValueRef is_ma_x;
 775         LLVMValueRef sgn;
 776         LLVMValueRef tmp;
 777
 778         is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
 779                 selcoords->ma, LLVMConstReal(f32, 0.0), "");
 780         sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
 781                 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
 782
 783         is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
 784         is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
 785         is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
 786                 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
 787         is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
 788
 789         /* Select sc */
 790         tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
 791         sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
 792                 LLVMBuildSelect(builder, is_ma_z, sgn_ma,
 793                         LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
 794         out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
 795
 796         /* Select tc */
 797         tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
 798         sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
 799                 LLVMConstReal(f32, -1.0), "");
 800         out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
 801
 802         /* Select ma */
 803         tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
 804                 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
 805         tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 806                                  ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
 807         *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
 808 }
 809
 810 void
 811 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
 812                        bool is_deriv, bool is_array, bool is_lod,
 813                        LLVMValueRef *coords_arg,
 814                        LLVMValueRef *derivs_arg)
 815 {
 816
 817         LLVMBuilderRef builder = ctx->builder;
 818         struct cube_selection_coords selcoords;
 819         LLVMValueRef coords[3];
 820         LLVMValueRef invma;
 821
 822         if (is_array && !is_lod) {
 823                 LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
 824
 825                 /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
 826                  *
 827                  *    "For Array forms, the array layer used will be
 828                  *
 829                  *       max(0, min(d−1, floor(layer+0.5)))
 830                  *
 831                  *     where d is the depth of the texture array and layer
 832                  *     comes from the component indicated in the tables below.
 833                  *     Workaroudn for an issue where the layer is taken from a
 834                  *     helper invocation which happens to fall on a different
 835                  *     layer due to extrapolation."
 836                  *
 837                  * GFX8 and earlier attempt to implement this in hardware by
 838                  * clamping the value of coords[2] = (8 * layer) + face.
 839                  * Unfortunately, this means that the we end up with the wrong
 840                  * face when clamping occurs.
 841                  *
 842                  * Clamp the layer earlier to work around the issue.
 843                  */
 844                 if (ctx->chip_class <= GFX8) {
 845                         LLVMValueRef ge0;
 846                         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
 847                         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
 848                 }
 849
 850                 coords_arg[3] = tmp;
 851         }
 852
 853         build_cube_intrinsic(ctx, coords_arg, &selcoords);
 854
 855         invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 856                         ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
 857         invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
 858
 859         for (int i = 0; i < 2; ++i)
 860                 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
 861
 862         coords[2] = selcoords.id;
 863
 864         if (is_deriv && derivs_arg) {
 865                 LLVMValueRef derivs[4];
 866                 int axis;
 867
 868                 /* Convert cube derivatives to 2D derivatives. */
 869                 for (axis = 0; axis < 2; axis++) {
 870                         LLVMValueRef deriv_st[2];
 871                         LLVMValueRef deriv_ma;
 872
 873                         /* Transform the derivative alongside the texture
 874                          * coordinate. Mathematically, the correct formula is
 875                          * as follows. Assume we're projecting onto the +Z face
 876                          * and denote by dx/dh the derivative of the (original)
 877                          * X texture coordinate with respect to horizontal
 878                          * window coordinates. The projection onto the +Z face
 879                          * plane is:
 880                          *
 881                          *   f(x,z) = x/z
 882                          *
 883                          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
 884                          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
 885                          *
 886                          * This motivatives the implementation below.
 887                          *
 888                          * Whether this actually gives the expected results for
 889                          * apps that might feed in derivatives obtained via
 890                          * finite differences is anyone's guess. The OpenGL spec
 891                          * seems awfully quiet about how textureGrad for cube
 892                          * maps should be handled.
 893                          */
 894                         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
 895                                           deriv_st, &deriv_ma);
 896
 897                         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
 898
 899                         for (int i = 0; i < 2; ++i)
 900                                 derivs[axis * 2 + i] =
 901                                         LLVMBuildFSub(builder,
 902                                                 LLVMBuildFMul(builder, deriv_st[i], invma, ""),
 903                                                 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
 904                 }
 905
 906                 memcpy(derivs_arg, derivs, sizeof(derivs));
 907         }
 908
 909         /* Shift the texture coordinate. This must be applied after the
 910          * derivative calculation.
 911          */
 912         for (int i = 0; i < 2; ++i)
 913                 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
 914
 915         if (is_array) {
 916                 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
 917                 /* coords_arg.w component - array_index for cube arrays */
 918                 coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
 919         }
 920
 921         memcpy(coords_arg, coords, sizeof(coords));
 922 }
 923
 924
 925 LLVMValueRef
 926 ac_build_fs_interp(struct ac_llvm_context *ctx,
 927                    LLVMValueRef llvm_chan,
 928                    LLVMValueRef attr_number,
 929                    LLVMValueRef params,
 930                    LLVMValueRef i,
 931                    LLVMValueRef j)
 932 {
 933         LLVMValueRef args[5];
 934         LLVMValueRef p1;
 935
 936         args[0] = i;
 937         args[1] = llvm_chan;
 938         args[2] = attr_number;
 939         args[3] = params;
 940
 941         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
 942                                 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 943
 944         args[0] = p1;
 945         args[1] = j;
 946         args[2] = llvm_chan;
 947         args[3] = attr_number;
 948         args[4] = params;
 949
 950         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
 951                                   ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 952 }
 953
 954 LLVMValueRef
 955 ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
 956                        LLVMValueRef llvm_chan,
 957                        LLVMValueRef attr_number,
 958                        LLVMValueRef params,
 959                        LLVMValueRef i,
 960                        LLVMValueRef j)
 961 {
 962         LLVMValueRef args[6];
 963         LLVMValueRef p1;
 964
 965         args[0] = i;
 966         args[1] = llvm_chan;
 967         args[2] = attr_number;
 968         args[3] = ctx->i1false;
 969         args[4] = params;
 970
 971         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
 972                                 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 973
 974         args[0] = p1;
 975         args[1] = j;
 976         args[2] = llvm_chan;
 977         args[3] = attr_number;
 978         args[4] = ctx->i1false;
 979         args[5] = params;
 980
 981         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
 982                                   ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
 983 }
 984
 985 LLVMValueRef
 986 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
 987                        LLVMValueRef parameter,
 988                        LLVMValueRef llvm_chan,
 989                        LLVMValueRef attr_number,
 990                        LLVMValueRef params)
 991 {
 992         LLVMValueRef args[4];
 993
 994         args[0] = parameter;
 995         args[1] = llvm_chan;
 996         args[2] = attr_number;
 997         args[3] = params;
 998
 999         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
1000                                   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
1001 }
1002
1003 LLVMValueRef
1004 ac_build_gep_ptr(struct ac_llvm_context *ctx,
1005                  LLVMValueRef base_ptr,
1006                  LLVMValueRef index)
1007 {
1008         return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1009 }
1010
1011 LLVMValueRef
1012 ac_build_gep0(struct ac_llvm_context *ctx,
1013               LLVMValueRef base_ptr,
1014               LLVMValueRef index)
1015 {
1016         LLVMValueRef indices[2] = {
1017                 ctx->i32_0,
1018                 index,
1019         };
1020         return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
1021 }
1022
1023 LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
1024                                   LLVMValueRef index)
1025 {
1026         return LLVMBuildPointerCast(ctx->builder,
1027                                     LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
1028                                     LLVMTypeOf(ptr), "");
1029 }
1030
1031 void
1032 ac_build_indexed_store(struct ac_llvm_context *ctx,
1033                        LLVMValueRef base_ptr, LLVMValueRef index,
1034                        LLVMValueRef value)
1035 {
1036         LLVMBuildStore(ctx->builder, value,
1037                        ac_build_gep0(ctx, base_ptr, index));
1038 }
1039
1040 /**
1041  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1042  * It's equivalent to doing a load from &base_ptr[index].
1043  *
1044  * \param base_ptr  Where the array starts.
1045  * \param index     The element index into the array.
1046  * \param uniform   Whether the base_ptr and index can be assumed to be
1047  *                  dynamically uniform (i.e. load to an SGPR)
1048  * \param invariant Whether the load is invariant (no other opcodes affect it)
1049  * \param no_unsigned_wraparound
1050  *    For all possible re-associations and re-distributions of an expression
1051  *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1052  *    without inbounds in base_ptr), this parameter is true if "addr + offset"
1053  *    does not result in an unsigned integer wraparound. This is used for
1054  *    optimal code generation of 32-bit pointer arithmetic.
1055  *
1056  *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
1057  *    integer wraparound can't be an imm offset in s_load_dword, because
1058  *    the instruction performs "addr + offset" in 64 bits.
1059  *
1060  *    Expected usage for bindless textures by chaining GEPs:
1061  *      // possible unsigned wraparound, don't use InBounds:
1062  *      ptr1 = LLVMBuildGEP(base_ptr, index);
1063  *      image = load(ptr1); // becomes "s_load ptr1, 0"
1064  *
1065  *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1066  *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1067  */
1068 static LLVMValueRef
1069 ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1070                      LLVMValueRef index, bool uniform, bool invariant,
1071                      bool no_unsigned_wraparound)
1072 {
1073         LLVMValueRef pointer, result;
1074
1075         if (no_unsigned_wraparound &&
1076             LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1077                 pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
1078         else
1079                 pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1080
1081         if (uniform)
1082                 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1083         result = LLVMBuildLoad(ctx->builder, pointer, "");
1084         if (invariant)
1085                 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1086         return result;
1087 }
1088
1089 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1090                            LLVMValueRef index)
1091 {
1092         return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
1093 }
1094
1095 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
1096                                      LLVMValueRef base_ptr, LLVMValueRef index)
1097 {
1098         return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
1099 }
1100
1101 /* This assumes that there is no unsigned integer wraparound during the address
1102  * computation, excluding all GEPs within base_ptr. */
1103 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
1104                                    LLVMValueRef base_ptr, LLVMValueRef index)
1105 {
1106         return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1107 }
1108
1109 /* See ac_build_load_custom() documentation. */
1110 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1111                                    LLVMValueRef base_ptr, LLVMValueRef index)
1112 {
1113         return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1114 }
1115
1116 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx,
1117                                       unsigned cache_policy)
1118 {
1119         return cache_policy |
1120                (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
1121 }
1122
1123 static void
1124 ac_build_llvm7_buffer_store_common(struct ac_llvm_context *ctx,
1125                                    LLVMValueRef rsrc,
1126                                    LLVMValueRef data,
1127                                    LLVMValueRef vindex,
1128                                    LLVMValueRef voffset,
1129                                    unsigned num_channels,
1130                                    unsigned cache_policy,
1131                                    bool use_format)
1132 {
1133         LLVMValueRef args[] = {
1134                 data,
1135                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1136                 vindex ? vindex : ctx->i32_0,
1137                 voffset,
1138                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
1139                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0)
1140         };
1141         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1142
1143         const char *type_names[] = {"f32", "v2f32", "v4f32"};
1144         char name[256];
1145
1146         if (use_format) {
1147                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s",
1148                          type_names[func]);
1149         } else {
1150                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
1151                          type_names[func]);
1152         }
1153
1154         ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
1155                            AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1156 }
1157
1158 static void
1159 ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
1160                                    LLVMValueRef rsrc,
1161                                    LLVMValueRef data,
1162                                    LLVMValueRef vindex,
1163                                    LLVMValueRef voffset,
1164                                    LLVMValueRef soffset,
1165                                    unsigned num_channels,
1166                                    LLVMTypeRef return_channel_type,
1167                                    unsigned cache_policy,
1168                                    bool use_format,
1169                                    bool structurized)
1170 {
1171         LLVMValueRef args[6];
1172         int idx = 0;
1173         args[idx++] = data;
1174         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1175         if (structurized)
1176                 args[idx++] = vindex ? vindex : ctx->i32_0;
1177         args[idx++] = voffset ? voffset : ctx->i32_0;
1178         args[idx++] = soffset ? soffset : ctx->i32_0;
1179         args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
1180         unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1181         const char *indexing_kind = structurized ? "struct" : "raw";
1182         char name[256], type_name[8];
1183
1184         LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type;
1185         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1186
1187         if (use_format) {
1188                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
1189                          indexing_kind, type_name);
1190         } else {
1191                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s",
1192                          indexing_kind, type_name);
1193         }
1194
1195         ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
1196                            AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1197 }
1198
1199 void
1200 ac_build_buffer_store_format(struct ac_llvm_context *ctx,
1201                              LLVMValueRef rsrc,
1202                              LLVMValueRef data,
1203                              LLVMValueRef vindex,
1204                              LLVMValueRef voffset,
1205                              unsigned num_channels,
1206                              unsigned cache_policy)
1207 {
1208         if (HAVE_LLVM >= 0x800) {
1209                 ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
1210                                                    voffset, NULL, num_channels,
1211                                                    ctx->f32, cache_policy,
1212                                                    true, true);
1213         } else {
1214                 ac_build_llvm7_buffer_store_common(ctx, rsrc, data, vindex, voffset,
1215                                                    num_channels, cache_policy,
1216                                                    true);
1217         }
1218 }
1219
1220 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1221  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1222  * or v4i32 (num_channels=3,4).
1223  */
1224 void
1225 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
1226                             LLVMValueRef rsrc,
1227                             LLVMValueRef vdata,
1228                             unsigned num_channels,
1229                             LLVMValueRef voffset,
1230                             LLVMValueRef soffset,
1231                             unsigned inst_offset,
1232                             unsigned cache_policy,
1233                             bool swizzle_enable_hint)
1234 {
1235         /* Split 3 channel stores, because only LLVM 9+ support 3-channel
1236          * intrinsics. */
1237         if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
1238                 LLVMValueRef v[3], v01;
1239
1240                 for (int i = 0; i < 3; i++) {
1241                         v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
1242                                         LLVMConstInt(ctx->i32, i, 0), "");
1243                 }
1244                 v01 = ac_build_gather_values(ctx, v, 2);
1245
1246                 ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
1247                                             soffset, inst_offset, cache_policy,
1248                                             swizzle_enable_hint);
1249                 ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
1250                                             soffset, inst_offset + 8,
1251                                             cache_policy,
1252                                             swizzle_enable_hint);
1253                 return;
1254         }
1255
1256         /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
1257          * (voffset is swizzled, but soffset isn't swizzled).
1258          * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
1259          */
1260         if (!swizzle_enable_hint) {
1261                 LLVMValueRef offset = soffset;
1262
1263                 if (inst_offset)
1264                         offset = LLVMBuildAdd(ctx->builder, offset,
1265                                               LLVMConstInt(ctx->i32, inst_offset, 0), "");
1266
1267                 if (HAVE_LLVM >= 0x800) {
1268                         ac_build_llvm8_buffer_store_common(ctx, rsrc,
1269                                                            ac_to_float(ctx, vdata),
1270                                                            ctx->i32_0,
1271                                                            voffset, offset,
1272                                                            num_channels,
1273                                                            ctx->f32,
1274                                                            cache_policy,
1275                                                            false, false);
1276                 } else {
1277                         if (voffset)
1278                                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1279
1280                         ac_build_llvm7_buffer_store_common(ctx, rsrc,
1281                                                            ac_to_float(ctx, vdata),
1282                                                            ctx->i32_0, offset,
1283                                                            num_channels, cache_policy,
1284                                                            false);
1285                 }
1286                 return;
1287         }
1288
1289         static const unsigned dfmts[] = {
1290                 V_008F0C_BUF_DATA_FORMAT_32,
1291                 V_008F0C_BUF_DATA_FORMAT_32_32,
1292                 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1293                 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1294         };
1295         unsigned dfmt = dfmts[num_channels - 1];
1296         unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1297         LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
1298
1299         ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1300                                    immoffset, num_channels, dfmt, nfmt, cache_policy);
1301 }
1302
1303 static LLVMValueRef
1304 ac_build_llvm7_buffer_load_common(struct ac_llvm_context *ctx,
1305                                   LLVMValueRef rsrc,
1306                                   LLVMValueRef vindex,
1307                                   LLVMValueRef voffset,
1308                                   unsigned num_channels,
1309                                   unsigned cache_policy,
1310                                   bool can_speculate,
1311                                   bool use_format)
1312 {
1313         LLVMValueRef args[] = {
1314                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1315                 vindex ? vindex : ctx->i32_0,
1316                 voffset,
1317                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
1318                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0)
1319         };
1320         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1321
1322         LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
1323         const char *type_names[] = {"f32", "v2f32", "v4f32"};
1324         char name[256];
1325
1326         if (use_format) {
1327                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s",
1328                          type_names[func]);
1329         } else {
1330                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
1331                          type_names[func]);
1332         }
1333
1334         return ac_build_intrinsic(ctx, name, types[func], args,
1335                                   ARRAY_SIZE(args),
1336                                   ac_get_load_intr_attribs(can_speculate));
1337 }
1338
1339 static LLVMValueRef
1340 ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
1341                                   LLVMValueRef rsrc,
1342                                   LLVMValueRef vindex,
1343                                   LLVMValueRef voffset,
1344                                   LLVMValueRef soffset,
1345                                   unsigned num_channels,
1346                                   LLVMTypeRef channel_type,
1347                                   unsigned cache_policy,
1348                                   bool can_speculate,
1349                                   bool use_format,
1350                                   bool structurized)
1351 {
1352         LLVMValueRef args[5];
1353         int idx = 0;
1354         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1355         if (structurized)
1356                 args[idx++] = vindex ? vindex : ctx->i32_0;
1357         args[idx++] = voffset ? voffset : ctx->i32_0;
1358         args[idx++] = soffset ? soffset : ctx->i32_0;
1359         args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1360         unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1361         const char *indexing_kind = structurized ? "struct" : "raw";
1362         char name[256], type_name[8];
1363
1364         LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
1365         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1366
1367         if (use_format) {
1368                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
1369                          indexing_kind, type_name);
1370         } else {
1371                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
1372                          indexing_kind, type_name);
1373         }
1374
1375         return ac_build_intrinsic(ctx, name, type, args, idx,
1376                                   ac_get_load_intr_attribs(can_speculate));
1377 }
1378
1379 LLVMValueRef
1380 ac_build_buffer_load(struct ac_llvm_context *ctx,
1381                      LLVMValueRef rsrc,
1382                      int num_channels,
1383                      LLVMValueRef vindex,
1384                      LLVMValueRef voffset,
1385                      LLVMValueRef soffset,
1386                      unsigned inst_offset,
1387                      unsigned cache_policy,
1388                      bool can_speculate,
1389                      bool allow_smem)
1390 {
1391         LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
1392         if (voffset)
1393                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1394         if (soffset)
1395                 offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
1396
1397         if (allow_smem && !(cache_policy & ac_slc) &&
1398             (!(cache_policy & ac_glc) || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= GFX8))) {
1399                 assert(vindex == NULL);
1400
1401                 LLVMValueRef result[8];
1402
1403                 for (int i = 0; i < num_channels; i++) {
1404                         if (i) {
1405                                 offset = LLVMBuildAdd(ctx->builder, offset,
1406                                                       LLVMConstInt(ctx->i32, 4, 0), "");
1407                         }
1408                         const char *intrname =
1409                                 HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
1410                                                     : "llvm.SI.load.const.v4i32";
1411                         unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
1412                         LLVMValueRef args[3] = {
1413                                 rsrc,
1414                                 offset,
1415                                 LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
1416                         };
1417                         result[i] = ac_build_intrinsic(ctx, intrname,
1418                                                        ctx->f32, args, num_args,
1419                                                        AC_FUNC_ATTR_READNONE |
1420                                                        (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0));
1421                 }
1422                 if (num_channels == 1)
1423                         return result[0];
1424
1425                 if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
1426                         result[num_channels++] = LLVMGetUndef(ctx->f32);
1427                 return ac_build_gather_values(ctx, result, num_channels);
1428         }
1429
1430         if (HAVE_LLVM >= 0x0800) {
1431                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex,
1432                                                          offset, ctx->i32_0,
1433                                                          num_channels, ctx->f32,
1434                                                          cache_policy,
1435                                                          can_speculate, false,
1436                                                          false);
1437         }
1438
1439         return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, offset,
1440                                                  num_channels, cache_policy,
1441                                                  can_speculate, false);
1442 }
1443
1444 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
1445                                          LLVMValueRef rsrc,
1446                                          LLVMValueRef vindex,
1447                                          LLVMValueRef voffset,
1448                                          unsigned num_channels,
1449                                          unsigned cache_policy,
1450                                          bool can_speculate)
1451 {
1452         if (HAVE_LLVM >= 0x800) {
1453                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1454                                                          num_channels, ctx->f32,
1455                                                          cache_policy, can_speculate, true, true);
1456         }
1457         return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, voffset,
1458                                                  num_channels, cache_policy,
1459                                                  can_speculate, true);
1460 }
1461
1462 LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
1463                                                   LLVMValueRef rsrc,
1464                                                   LLVMValueRef vindex,
1465                                                   LLVMValueRef voffset,
1466                                                   unsigned num_channels,
1467                                                   unsigned cache_policy,
1468                                                   bool can_speculate)
1469 {
1470         if (HAVE_LLVM >= 0x800) {
1471                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1472                                                          num_channels, ctx->f32,
1473                                                          cache_policy, can_speculate, true, true);
1474         }
1475
1476         LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
1477         LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
1478         stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
1479
1480         LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
1481                                                       LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
1482                                                       elem_count, stride, "");
1483
1484         LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
1485                                                        LLVMConstInt(ctx->i32, 2, 0), "");
1486
1487         return ac_build_llvm7_buffer_load_common(ctx, new_rsrc, vindex, voffset,
1488                                                  num_channels, cache_policy,
1489                                                  can_speculate, true);
1490 }
1491
1492 /// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
1493 /// value for LLVM8+ tbuffer intrinsics.
1494 static unsigned
1495 ac_get_tbuffer_format(struct ac_llvm_context *ctx,
1496                       unsigned dfmt, unsigned nfmt)
1497 {
1498         if (ctx->chip_class >= GFX10) {
1499                 unsigned format;
1500                 switch (dfmt) {
1501                 default: unreachable("bad dfmt");
1502                 case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break;
1503                 case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break;
1504                 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break;
1505                 case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break;
1506                 case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break;
1507                 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break;
1508                 case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break;
1509                 case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break;
1510                 case V_008F0C_BUF_DATA_FORMAT_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_UINT; break;
1511                 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break;
1512                 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; break;
1513                 }
1514
1515                 // Use the regularity properties of the combined format enum.
1516                 //
1517                 // Note: float is incompatible with 8-bit data formats,
1518                 //       [us]{norm,scaled} are incomparible with 32-bit data formats.
1519                 //       [us]scaled are not writable.
1520                 switch (nfmt) {
1521                 case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break;
1522                 case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break;
1523                 case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break;
1524                 case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break;
1525                 default: unreachable("bad nfmt");
1526                 case V_008F0C_BUF_NUM_FORMAT_UINT: break;
1527                 case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break;
1528                 case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break;
1529                 }
1530
1531                 return format;
1532         } else {
1533                 return dfmt | (nfmt << 4);
1534         }
1535 }
1536
1537 static LLVMValueRef
1538 ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
1539                             LLVMValueRef rsrc,
1540                             LLVMValueRef vindex,
1541                             LLVMValueRef voffset,
1542                             LLVMValueRef soffset,
1543                             unsigned num_channels,
1544                             unsigned dfmt,
1545                             unsigned nfmt,
1546                             unsigned cache_policy,
1547                             bool can_speculate,
1548                             bool structurized)
1549 {
1550         LLVMValueRef args[6];
1551         int idx = 0;
1552         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1553         if (structurized)
1554                 args[idx++] = vindex ? vindex : ctx->i32_0;
1555         args[idx++] = voffset ? voffset : ctx->i32_0;
1556         args[idx++] = soffset ? soffset : ctx->i32_0;
1557         args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
1558         args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1559         unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1560         const char *indexing_kind = structurized ? "struct" : "raw";
1561         char name[256], type_name[8];
1562
1563         LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1564         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1565
1566         snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
1567                  indexing_kind, type_name);
1568
1569         return ac_build_intrinsic(ctx, name, type, args, idx,
1570                                   ac_get_load_intr_attribs(can_speculate));
1571 }
1572
1573 static LLVMValueRef
1574 ac_build_tbuffer_load(struct ac_llvm_context *ctx,
1575                             LLVMValueRef rsrc,
1576                             LLVMValueRef vindex,
1577                             LLVMValueRef voffset,
1578                             LLVMValueRef soffset,
1579                             LLVMValueRef immoffset,
1580                             unsigned num_channels,
1581                             unsigned dfmt,
1582                             unsigned nfmt,
1583                             unsigned cache_policy,
1584                             bool can_speculate,
1585                             bool structurized) /* only matters for LLVM 8+ */
1586 {
1587         if (HAVE_LLVM >= 0x800) {
1588                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1589
1590                 return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset,
1591                                                    soffset, num_channels,
1592                                                    dfmt, nfmt, cache_policy,
1593                                                    can_speculate, structurized);
1594         }
1595
1596         LLVMValueRef args[] = {
1597                 rsrc,
1598                 vindex ? vindex : ctx->i32_0,
1599                 voffset,
1600                 soffset,
1601                 immoffset,
1602                 LLVMConstInt(ctx->i32, dfmt, false),
1603                 LLVMConstInt(ctx->i32, nfmt, false),
1604                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false),
1605                 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false),
1606         };
1607         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1608         LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
1609         const char *type_names[] = {"i32", "v2i32", "v4i32"};
1610         char name[256];
1611
1612         snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s",
1613                  type_names[func]);
1614
1615         return ac_build_intrinsic(ctx, name, types[func], args, 9,
1616                                   ac_get_load_intr_attribs(can_speculate));
1617 }
1618
1619 LLVMValueRef
1620 ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx,
1621                              LLVMValueRef rsrc,
1622                              LLVMValueRef vindex,
1623                              LLVMValueRef voffset,
1624                              LLVMValueRef soffset,
1625                              LLVMValueRef immoffset,
1626                              unsigned num_channels,
1627                              unsigned dfmt,
1628                              unsigned nfmt,
1629                              unsigned cache_policy,
1630                              bool can_speculate)
1631 {
1632         return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset,
1633                                      immoffset, num_channels, dfmt, nfmt,
1634                                      cache_policy, can_speculate, true);
1635 }
1636
1637 LLVMValueRef
1638 ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx,
1639                           LLVMValueRef rsrc,
1640                           LLVMValueRef voffset,
1641                           LLVMValueRef soffset,
1642                           LLVMValueRef immoffset,
1643                           unsigned num_channels,
1644                           unsigned dfmt,
1645                           unsigned nfmt,
1646                           unsigned cache_policy,
1647                           bool can_speculate)
1648 {
1649         return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset,
1650                                      immoffset, num_channels, dfmt, nfmt,
1651                                      cache_policy, can_speculate, false);
1652 }
1653
1654 LLVMValueRef
1655 ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
1656                             LLVMValueRef rsrc,
1657                             LLVMValueRef voffset,
1658                             LLVMValueRef soffset,
1659                             LLVMValueRef immoffset,
1660                             unsigned cache_policy)
1661 {
1662         LLVMValueRef res;
1663
1664         if (HAVE_LLVM >= 0x900) {
1665                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1666
1667                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1668                 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1669                                                         voffset, soffset,
1670                                                         1, ctx->i16, cache_policy,
1671                                                         false, false, false);
1672         } else {
1673                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1674                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1675
1676                 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1677                                                 immoffset, 1, dfmt, nfmt, cache_policy,
1678                                                 false);
1679
1680                 res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
1681         }
1682
1683         return res;
1684 }
1685
1686 LLVMValueRef
1687 ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx,
1688                            LLVMValueRef rsrc,
1689                            LLVMValueRef voffset,
1690                            LLVMValueRef soffset,
1691                            LLVMValueRef immoffset,
1692                            unsigned cache_policy)
1693 {
1694         LLVMValueRef res;
1695
1696         if (HAVE_LLVM >= 0x900) {
1697                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1698
1699                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1700                 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1701                                                         voffset, soffset,
1702                                                         1, ctx->i8, cache_policy,
1703                                                         false, false, false);
1704         } else {
1705                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1706                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1707
1708                 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1709                                                 immoffset, 1, dfmt, nfmt, cache_policy,
1710                                                 false);
1711
1712                 res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
1713         }
1714
1715         return res;
1716 }
1717
1718 /**
1719  * Convert an 11- or 10-bit unsigned floating point number to an f32.
1720  *
1721  * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1722  * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1723  */
1724 static LLVMValueRef
1725 ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits, unsigned mant_bits)
1726 {
1727         assert(LLVMTypeOf(src) == ctx->i32);
1728
1729         LLVMValueRef tmp;
1730         LLVMValueRef mantissa;
1731         mantissa = LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1732
1733         /* Converting normal numbers is just a shift + correcting the exponent bias */
1734         unsigned normal_shift = 23 - mant_bits;
1735         unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1736         LLVMValueRef shifted, normal;
1737
1738         shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1739         normal = LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1740
1741         /* Converting nan/inf numbers is the same, but with a different exponent update */
1742         LLVMValueRef naninf;
1743         naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1744
1745         /* Converting denormals is the complex case: determine the leading zeros of the
1746          * mantissa to obtain the correct shift for the mantissa and exponent correction.
1747          */
1748         LLVMValueRef denormal;
1749         LLVMValueRef params[2] = {
1750                 mantissa,
1751                 ctx->i1true, /* result can be undef when arg is 0 */
1752         };
1753         LLVMValueRef ctlz = ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32,
1754                                               params, 2, AC_FUNC_ATTR_READNONE);
1755
1756         /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1757         tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1758         denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1759
1760         unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1761         tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1762         tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1763         denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1764
1765         /* Select the final result. */
1766         LLVMValueRef result;
1767
1768         tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1769                             LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
1770         result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1771
1772         tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1773                             LLVMConstInt(ctx->i32, 1 << mant_bits, false), "");
1774         result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1775
1776         tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1777         result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1778
1779         return ac_to_float(ctx, result);
1780 }
1781
1782 /**
1783  * Generate a fully general open coded buffer format fetch with all required
1784  * fixups suitable for vertex fetch, using non-format buffer loads.
1785  *
1786  * Some combinations of argument values have special interpretations:
1787  * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1788  * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1789  *
1790  * \param log_size log(size of channel in bytes)
1791  * \param num_channels number of channels (1 to 4)
1792  * \param format AC_FETCH_FORMAT_xxx value
1793  * \param reverse whether XYZ channels are reversed
1794  * \param known_aligned whether the source is known to be aligned to hardware's
1795  *                      effective element size for loading the given format
1796  *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1797  * \param rsrc buffer resource descriptor
1798  * \return the resulting vector of floats or integers bitcast to <4 x i32>
1799  */
1800 LLVMValueRef
1801 ac_build_opencoded_load_format(struct ac_llvm_context *ctx,
1802                                unsigned log_size,
1803                                unsigned num_channels,
1804                                unsigned format,
1805                                bool reverse,
1806                                bool known_aligned,
1807                                LLVMValueRef rsrc,
1808                                LLVMValueRef vindex,
1809                                LLVMValueRef voffset,
1810                                LLVMValueRef soffset,
1811                                unsigned cache_policy,
1812                                bool can_speculate)
1813 {
1814         LLVMValueRef tmp;
1815         unsigned load_log_size = log_size;
1816         unsigned load_num_channels = num_channels;
1817         if (log_size == 3) {
1818                 load_log_size = 2;
1819                 if (format == AC_FETCH_FORMAT_FLOAT) {
1820                         load_num_channels = 2 * num_channels;
1821                 } else {
1822                         load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1823                 }
1824         }
1825
1826         int log_recombine = 0;
1827         if (ctx->chip_class == GFX6 && !known_aligned) {
1828                 /* Avoid alignment restrictions by loading one byte at a time. */
1829                 load_num_channels <<= load_log_size;
1830                 log_recombine = load_log_size;
1831                 load_log_size = 0;
1832         } else if (load_num_channels == 2 || load_num_channels == 4) {
1833                 log_recombine = -util_logbase2(load_num_channels);
1834                 load_num_channels = 1;
1835                 load_log_size += -log_recombine;
1836         }
1837
1838         assert(load_log_size >= 2 || HAVE_LLVM >= 0x0900);
1839
1840         LLVMValueRef loads[32]; /* up to 32 bytes */
1841         for (unsigned i = 0; i < load_num_channels; ++i) {
1842                 tmp = LLVMBuildAdd(ctx->builder, soffset,
1843                                    LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1844                 if (HAVE_LLVM >= 0x0800) {
1845                         LLVMTypeRef channel_type = load_log_size == 0 ? ctx->i8 :
1846                                                    load_log_size == 1 ? ctx->i16 : ctx->i32;
1847                         unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1848                         loads[i] = ac_build_llvm8_buffer_load_common(
1849                                         ctx, rsrc, vindex, voffset, tmp,
1850                                         num_channels, channel_type, cache_policy,
1851                                         can_speculate, false, true);
1852                 } else {
1853                         tmp = LLVMBuildAdd(ctx->builder, voffset, tmp, "");
1854                         loads[i] = ac_build_llvm7_buffer_load_common(
1855                                         ctx, rsrc, vindex, tmp,
1856                                         1 << (load_log_size - 2), cache_policy, can_speculate, false);
1857                 }
1858                 if (load_log_size >= 2)
1859                         loads[i] = ac_to_integer(ctx, loads[i]);
1860         }
1861
1862         if (log_recombine > 0) {
1863                 /* Recombine bytes if necessary (GFX6 only) */
1864                 LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1865
1866                 for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1867                         LLVMValueRef accum = NULL;
1868                         for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1869                                 tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1870                                 if (i == 0) {
1871                                         accum = tmp;
1872                                 } else {
1873                                         tmp = LLVMBuildShl(ctx->builder, tmp,
1874                                                            LLVMConstInt(dst_type, 8 * i, false), "");
1875                                         accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1876                                 }
1877                         }
1878                         loads[dst] = accum;
1879                 }
1880         } else if (log_recombine < 0) {
1881                 /* Split vectors of dwords */
1882                 if (load_log_size > 2) {
1883                         assert(load_num_channels == 1);
1884                         LLVMValueRef loaded = loads[0];
1885                         unsigned log_split = load_log_size - 2;
1886                         log_recombine += log_split;
1887                         load_num_channels = 1 << log_split;
1888                         load_log_size = 2;
1889                         for (unsigned i = 0; i < load_num_channels; ++i) {
1890                                 tmp = LLVMConstInt(ctx->i32, i, false);
1891                                 loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1892                         }
1893                 }
1894
1895                 /* Further split dwords and shorts if required */
1896                 if (log_recombine < 0) {
1897                         for (unsigned src = load_num_channels,
1898                                       dst = load_num_channels << -log_recombine;
1899                              src > 0; --src) {
1900                                 unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1901                                 LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1902                                 LLVMValueRef loaded = loads[src - 1];
1903                                 LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1904                                 for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1905                                         tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1906                                         tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1907                                         loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1908                                 }
1909                         }
1910                 }
1911         }
1912
1913         if (log_size == 3) {
1914                 if (format == AC_FETCH_FORMAT_FLOAT) {
1915                         for (unsigned i = 0; i < num_channels; ++i) {
1916                                 tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1917                                 loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1918                         }
1919                 } else if (format == AC_FETCH_FORMAT_FIXED) {
1920                         /* 10_11_11_FLOAT */
1921                         LLVMValueRef data = loads[0];
1922                         LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1923                         LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1924                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1925                         LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1926                         LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1927
1928                         loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1929                         loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1930                         loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1931
1932                         num_channels = 3;
1933                         log_size = 2;
1934                         format = AC_FETCH_FORMAT_FLOAT;
1935                 } else {
1936                         /* 2_10_10_10 data formats */
1937                         LLVMValueRef data = loads[0];
1938                         LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1939                         LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1940                         loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1941                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1942                         loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1943                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1944                         loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1945                         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1946                         loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1947
1948                         num_channels = 4;
1949                 }
1950         }
1951
1952         if (format == AC_FETCH_FORMAT_FLOAT) {
1953                 if (log_size != 2) {
1954                         for (unsigned chan = 0; chan < num_channels; ++chan) {
1955                                 tmp = ac_to_float(ctx, loads[chan]);
1956                                 if (log_size == 3)
1957                                         tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1958                                 else if (log_size == 1)
1959                                         tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1960                                 loads[chan] = ac_to_integer(ctx, tmp);
1961                         }
1962                 }
1963         } else if (format == AC_FETCH_FORMAT_UINT) {
1964                 if (log_size != 2) {
1965                         for (unsigned chan = 0; chan < num_channels; ++chan)
1966                                 loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1967                 }
1968         } else if (format == AC_FETCH_FORMAT_SINT) {
1969                 if (log_size != 2) {
1970                         for (unsigned chan = 0; chan < num_channels; ++chan)
1971                                 loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1972                 }
1973         } else {
1974                 bool unsign = format == AC_FETCH_FORMAT_UNORM ||
1975                               format == AC_FETCH_FORMAT_USCALED ||
1976                               format == AC_FETCH_FORMAT_UINT;
1977
1978                 for (unsigned chan = 0; chan < num_channels; ++chan) {
1979                         if (unsign) {
1980                                 tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1981                         } else {
1982                                 tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1983                         }
1984
1985                         LLVMValueRef scale = NULL;
1986                         if (format == AC_FETCH_FORMAT_FIXED) {
1987                                 assert(log_size == 2);
1988                                 scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1989                         } else if (format == AC_FETCH_FORMAT_UNORM) {
1990                                 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1991                                 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1992                         } else if (format == AC_FETCH_FORMAT_SNORM) {
1993                                 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1994                                 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1995                         }
1996                         if (scale)
1997                                 tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1998
1999                         if (format == AC_FETCH_FORMAT_SNORM) {
2000                                 /* Clamp to [-1, 1] */
2001                                 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
2002                                 LLVMValueRef clamp =
2003                                         LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
2004                                 tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
2005                         }
2006
2007                         loads[chan] = ac_to_integer(ctx, tmp);
2008                 }
2009         }
2010
2011         while (num_channels < 4) {
2012                 if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
2013                         loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
2014                 } else {
2015                         loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
2016                 }
2017                 num_channels++;
2018         }
2019
2020         if (reverse) {
2021                 tmp = loads[0];
2022                 loads[0] = loads[2];
2023                 loads[2] = tmp;
2024         }
2025
2026         return ac_build_gather_values(ctx, loads, 4);
2027 }
2028
2029 static void
2030 ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
2031                              LLVMValueRef rsrc,
2032                              LLVMValueRef vdata,
2033                              LLVMValueRef vindex,
2034                              LLVMValueRef voffset,
2035                              LLVMValueRef soffset,
2036                              unsigned num_channels,
2037                              unsigned dfmt,
2038                              unsigned nfmt,
2039                              unsigned cache_policy,
2040                              bool structurized)
2041 {
2042         LLVMValueRef args[7];
2043         int idx = 0;
2044         args[idx++] = vdata;
2045         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
2046         if (structurized)
2047                 args[idx++] = vindex ? vindex : ctx->i32_0;
2048         args[idx++] = voffset ? voffset : ctx->i32_0;
2049         args[idx++] = soffset ? soffset : ctx->i32_0;
2050         args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
2051         args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
2052         unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
2053         const char *indexing_kind = structurized ? "struct" : "raw";
2054         char name[256], type_name[8];
2055
2056         LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
2057         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
2058
2059         snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
2060                  indexing_kind, type_name);
2061
2062         ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
2063                            AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
2064 }
2065
2066 static void
2067 ac_build_tbuffer_store(struct ac_llvm_context *ctx,
2068                        LLVMValueRef rsrc,
2069                        LLVMValueRef vdata,
2070                        LLVMValueRef vindex,
2071                        LLVMValueRef voffset,
2072                        LLVMValueRef soffset,
2073                        LLVMValueRef immoffset,
2074                        unsigned num_channels,
2075                        unsigned dfmt,
2076                        unsigned nfmt,
2077                        unsigned cache_policy,
2078                        bool structurized) /* only matters for LLVM 8+ */
2079 {
2080         if (HAVE_LLVM >= 0x800) {
2081                 voffset = LLVMBuildAdd(ctx->builder,
2082                                        voffset ? voffset : ctx->i32_0,
2083                                        immoffset, "");
2084
2085                 ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
2086                                              soffset, num_channels, dfmt, nfmt,
2087                                              cache_policy, structurized);
2088         } else {
2089                 LLVMValueRef params[] = {
2090                         vdata,
2091                         rsrc,
2092                         vindex ? vindex : ctx->i32_0,
2093                         voffset ? voffset : ctx->i32_0,
2094                         soffset ? soffset : ctx->i32_0,
2095                         immoffset,
2096                         LLVMConstInt(ctx->i32, dfmt, false),
2097                         LLVMConstInt(ctx->i32, nfmt, false),
2098                         LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false),
2099                         LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false),
2100                 };
2101                 unsigned func = CLAMP(num_channels, 1, 3) - 1;
2102                 const char *type_names[] = {"i32", "v2i32", "v4i32"};
2103                 char name[256];
2104
2105                 snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
2106                          type_names[func]);
2107
2108                 ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
2109                                    AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
2110         }
2111 }
2112
2113 void
2114 ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
2115                               LLVMValueRef rsrc,
2116                               LLVMValueRef vdata,
2117                               LLVMValueRef vindex,
2118                               LLVMValueRef voffset,
2119                               LLVMValueRef soffset,
2120                               LLVMValueRef immoffset,
2121                               unsigned num_channels,
2122                               unsigned dfmt,
2123                               unsigned nfmt,
2124                               unsigned cache_policy)
2125 {
2126         ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
2127                                immoffset, num_channels, dfmt, nfmt, cache_policy,
2128                                true);
2129 }
2130
2131 void
2132 ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
2133                            LLVMValueRef rsrc,
2134                            LLVMValueRef vdata,
2135                            LLVMValueRef voffset,
2136                            LLVMValueRef soffset,
2137                            LLVMValueRef immoffset,
2138                            unsigned num_channels,
2139                            unsigned dfmt,
2140                            unsigned nfmt,
2141                            unsigned cache_policy)
2142 {
2143         ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
2144                                immoffset, num_channels, dfmt, nfmt, cache_policy,
2145                                false);
2146 }
2147
2148 void
2149 ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
2150                              LLVMValueRef rsrc,
2151                              LLVMValueRef vdata,
2152                              LLVMValueRef voffset,
2153                              LLVMValueRef soffset,
2154                              unsigned cache_policy)
2155 {
2156         vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
2157
2158         if (HAVE_LLVM >= 0x900) {
2159                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
2160                 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
2161                                                    voffset, soffset, 1,
2162                                                    ctx->i16, cache_policy,
2163                                                    false, false);
2164         } else {
2165                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
2166                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
2167
2168                 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
2169
2170                 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
2171                                            ctx->i32_0, 1, dfmt, nfmt, cache_policy);
2172         }
2173 }
2174
2175 void
2176 ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
2177                             LLVMValueRef rsrc,
2178                             LLVMValueRef vdata,
2179                             LLVMValueRef voffset,
2180                             LLVMValueRef soffset,
2181                             unsigned cache_policy)
2182 {
2183         vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
2184
2185         if (HAVE_LLVM >= 0x900) {
2186                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
2187                 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
2188                                                    voffset, soffset, 1,
2189                                                    ctx->i8, cache_policy,
2190                                                    false, false);
2191         } else {
2192                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
2193                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
2194
2195                 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
2196
2197                 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
2198                                            ctx->i32_0, 1, dfmt, nfmt, cache_policy);
2199         }
2200 }
2201 /**
2202  * Set range metadata on an instruction.  This can only be used on load and
2203  * call instructions.  If you know an instruction can only produce the values
2204  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
2205  * \p lo is the minimum value inclusive.
2206  * \p hi is the maximum value exclusive.
2207  */
2208 static void set_range_metadata(struct ac_llvm_context *ctx,
2209                                LLVMValueRef value, unsigned lo, unsigned hi)
2210 {
2211         LLVMValueRef range_md, md_args[2];
2212         LLVMTypeRef type = LLVMTypeOf(value);
2213         LLVMContextRef context = LLVMGetTypeContext(type);
2214
2215         md_args[0] = LLVMConstInt(type, lo, false);
2216         md_args[1] = LLVMConstInt(type, hi, false);
2217         range_md = LLVMMDNodeInContext(context, md_args, 2);
2218         LLVMSetMetadata(value, ctx->range_md_kind, range_md);
2219 }
2220
2221 LLVMValueRef
2222 ac_get_thread_id(struct ac_llvm_context *ctx)
2223 {
2224         LLVMValueRef tid;
2225
2226         LLVMValueRef tid_args[2];
2227         tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
2228         tid_args[1] = ctx->i32_0;
2229         tid_args[1] = ac_build_intrinsic(ctx,
2230                                          "llvm.amdgcn.mbcnt.lo", ctx->i32,
2231                                          tid_args, 2, AC_FUNC_ATTR_READNONE);
2232
2233         if (ctx->wave_size == 32) {
2234                 tid = tid_args[1];
2235         } else {
2236                 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
2237                                          ctx->i32, tid_args,
2238                                          2, AC_FUNC_ATTR_READNONE);
2239         }
2240         set_range_metadata(ctx, tid, 0, ctx->wave_size);
2241         return tid;
2242 }
2243
2244 /*
2245  * AMD GCN implements derivatives using the local data store (LDS)
2246  * All writes to the LDS happen in all executing threads at
2247  * the same time. TID is the Thread ID for the current
2248  * thread and is a value between 0 and 63, representing
2249  * the thread's position in the wavefront.
2250  *
2251  * For the pixel shader threads are grouped into quads of four pixels.
2252  * The TIDs of the pixels of a quad are:
2253  *
2254  *  +------+------+
2255  *  |4n + 0|4n + 1|
2256  *  +------+------+
2257  *  |4n + 2|4n + 3|
2258  *  +------+------+
2259  *
2260  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
2261  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
2262  * the current pixel's column, and masking with 0xfffffffe yields the TID
2263  * of the left pixel of the current pixel's row.
2264  *
2265  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
2266  * adding 2 yields the TID of the pixel below the top pixel.
2267  */
2268 LLVMValueRef
2269 ac_build_ddxy(struct ac_llvm_context *ctx,
2270               uint32_t mask,
2271               int idx,
2272               LLVMValueRef val)
2273 {
2274         unsigned tl_lanes[4], trbl_lanes[4];
2275         char name[32], type[8];
2276         LLVMValueRef tl, trbl;
2277         LLVMTypeRef result_type;
2278         LLVMValueRef result;
2279
2280         result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
2281
2282         if (result_type == ctx->f16)
2283                 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
2284
2285         for (unsigned i = 0; i < 4; ++i) {
2286                 tl_lanes[i] = i & mask;
2287                 trbl_lanes[i] = (i & mask) + idx;
2288         }
2289
2290         tl = ac_build_quad_swizzle(ctx, val,
2291                                    tl_lanes[0], tl_lanes[1],
2292                                    tl_lanes[2], tl_lanes[3]);
2293         trbl = ac_build_quad_swizzle(ctx, val,
2294                                      trbl_lanes[0], trbl_lanes[1],
2295                                      trbl_lanes[2], trbl_lanes[3]);
2296
2297         if (result_type == ctx->f16) {
2298                 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
2299                 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
2300         }
2301
2302         tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
2303         trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
2304         result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
2305
2306         ac_build_type_name_for_intr(result_type, type, sizeof(type));
2307         snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
2308
2309         return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
2310 }
2311
2312 void
2313 ac_build_sendmsg(struct ac_llvm_context *ctx,
2314                  uint32_t msg,
2315                  LLVMValueRef wave_id)
2316 {
2317         LLVMValueRef args[2];
2318         args[0] = LLVMConstInt(ctx->i32, msg, false);
2319         args[1] = wave_id;
2320         ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
2321 }
2322
2323 LLVMValueRef
2324 ac_build_imsb(struct ac_llvm_context *ctx,
2325               LLVMValueRef arg,
2326               LLVMTypeRef dst_type)
2327 {
2328         LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
2329                                               dst_type, &arg, 1,
2330                                               AC_FUNC_ATTR_READNONE);
2331
2332         /* The HW returns the last bit index from MSB, but NIR/TGSI wants
2333          * the index from LSB. Invert it by doing "31 - msb". */
2334         msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
2335                            msb, "");
2336
2337         LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
2338         LLVMValueRef cond = LLVMBuildOr(ctx->builder,
2339                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2340                                                       arg, ctx->i32_0, ""),
2341                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2342                                                       arg, all_ones, ""), "");
2343
2344         return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
2345 }
2346
2347 LLVMValueRef
2348 ac_build_umsb(struct ac_llvm_context *ctx,
2349               LLVMValueRef arg,
2350               LLVMTypeRef dst_type)
2351 {
2352         const char *intrin_name;
2353         LLVMTypeRef type;
2354         LLVMValueRef highest_bit;
2355         LLVMValueRef zero;
2356         unsigned bitsize;
2357
2358         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
2359         switch (bitsize) {
2360         case 64:
2361                 intrin_name = "llvm.ctlz.i64";
2362                 type = ctx->i64;
2363                 highest_bit = LLVMConstInt(ctx->i64, 63, false);
2364                 zero = ctx->i64_0;
2365                 break;
2366         case 32:
2367                 intrin_name = "llvm.ctlz.i32";
2368                 type = ctx->i32;
2369                 highest_bit = LLVMConstInt(ctx->i32, 31, false);
2370                 zero = ctx->i32_0;
2371                 break;
2372         case 16:
2373                 intrin_name = "llvm.ctlz.i16";
2374                 type = ctx->i16;
2375                 highest_bit = LLVMConstInt(ctx->i16, 15, false);
2376                 zero = ctx->i16_0;
2377                 break;
2378         case 8:
2379                 intrin_name = "llvm.ctlz.i8";
2380                 type = ctx->i8;
2381                 highest_bit = LLVMConstInt(ctx->i8, 7, false);
2382                 zero = ctx->i8_0;
2383                 break;
2384         default:
2385                 unreachable(!"invalid bitsize");
2386                 break;
2387         }
2388
2389         LLVMValueRef params[2] = {
2390                 arg,
2391                 ctx->i1true,
2392         };
2393
2394         LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
2395                                               params, 2,
2396                                               AC_FUNC_ATTR_READNONE);
2397
2398         /* The HW returns the last bit index from MSB, but TGSI/NIR wants
2399          * the index from LSB. Invert it by doing "31 - msb". */
2400         msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
2401
2402         if (bitsize == 64) {
2403                 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
2404         } else if (bitsize < 32) {
2405                 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
2406         }
2407
2408         /* check for zero */
2409         return LLVMBuildSelect(ctx->builder,
2410                                LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
2411                                LLVMConstInt(ctx->i32, -1, true), msb, "");
2412 }
2413
2414 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
2415                            LLVMValueRef b)
2416 {
2417         char name[64];
2418         snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2419         LLVMValueRef args[2] = {a, b};
2420         return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2421                                   AC_FUNC_ATTR_READNONE);
2422 }
2423
2424 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
2425                            LLVMValueRef b)
2426 {
2427         char name[64];
2428         snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2429         LLVMValueRef args[2] = {a, b};
2430         return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2431                                   AC_FUNC_ATTR_READNONE);
2432 }
2433
2434 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a,
2435                            LLVMValueRef b)
2436 {
2437         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
2438         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2439 }
2440
2441 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a,
2442                            LLVMValueRef b)
2443 {
2444         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
2445         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2446 }
2447
2448 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
2449                            LLVMValueRef b)
2450 {
2451         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
2452         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2453 }
2454
2455 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a,
2456                            LLVMValueRef b)
2457 {
2458         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
2459         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2460 }
2461
2462 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
2463 {
2464         LLVMTypeRef t = LLVMTypeOf(value);
2465         return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
2466                              LLVMConstReal(t, 1.0));
2467 }
2468
2469 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
2470 {
2471         LLVMValueRef args[9];
2472
2473         args[0] = LLVMConstInt(ctx->i32, a->target, 0);
2474         args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
2475
2476         if (a->compr) {
2477                 LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
2478                 LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
2479
2480                 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
2481                                 v2i16, "");
2482                 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
2483                                 v2i16, "");
2484                 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
2485                 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2486
2487                 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
2488                                    ctx->voidt, args, 6, 0);
2489         } else {
2490                 args[2] = a->out[0];
2491                 args[3] = a->out[1];
2492                 args[4] = a->out[2];
2493                 args[5] = a->out[3];
2494                 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
2495                 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2496
2497                 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
2498                                    ctx->voidt, args, 8, 0);
2499         }
2500 }
2501
2502 void ac_build_export_null(struct ac_llvm_context *ctx)
2503 {
2504         struct ac_export_args args;
2505
2506         args.enabled_channels = 0x0; /* enabled channels */
2507         args.valid_mask = 1; /* whether the EXEC mask is valid */
2508         args.done = 1; /* DONE bit */
2509         args.target = V_008DFC_SQ_EXP_NULL;
2510         args.compr = 0; /* COMPR flag (0 = 32-bit export) */
2511         args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2512         args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2513         args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2514         args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2515
2516         ac_build_export(ctx, &args);
2517 }
2518
2519 static unsigned ac_num_coords(enum ac_image_dim dim)
2520 {
2521         switch (dim) {
2522         case ac_image_1d:
2523                 return 1;
2524         case ac_image_2d:
2525         case ac_image_1darray:
2526                  return 2;
2527         case ac_image_3d:
2528         case ac_image_cube:
2529         case ac_image_2darray:
2530         case ac_image_2dmsaa:
2531                 return 3;
2532         case ac_image_2darraymsaa:
2533                 return 4;
2534         default:
2535                 unreachable("ac_num_coords: bad dim");
2536         }
2537 }
2538
2539 static unsigned ac_num_derivs(enum ac_image_dim dim)
2540 {
2541         switch (dim) {
2542         case ac_image_1d:
2543         case ac_image_1darray:
2544                 return 2;
2545         case ac_image_2d:
2546         case ac_image_2darray:
2547         case ac_image_cube:
2548                 return 4;
2549         case ac_image_3d:
2550                 return 6;
2551         case ac_image_2dmsaa:
2552         case ac_image_2darraymsaa:
2553         default:
2554                 unreachable("derivatives not supported");
2555         }
2556 }
2557
2558 static const char *get_atomic_name(enum ac_atomic_op op)
2559 {
2560         switch (op) {
2561         case ac_atomic_swap: return "swap";
2562         case ac_atomic_add: return "add";
2563         case ac_atomic_sub: return "sub";
2564         case ac_atomic_smin: return "smin";
2565         case ac_atomic_umin: return "umin";
2566         case ac_atomic_smax: return "smax";
2567         case ac_atomic_umax: return "umax";
2568         case ac_atomic_and: return "and";
2569         case ac_atomic_or: return "or";
2570         case ac_atomic_xor: return "xor";
2571         }
2572         unreachable("bad atomic op");
2573 }
2574
2575 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
2576                                    struct ac_image_args *a)
2577 {
2578         const char *overload[3] = { "", "", "" };
2579         unsigned num_overloads = 0;
2580         LLVMValueRef args[18];
2581         unsigned num_args = 0;
2582         enum ac_image_dim dim = a->dim;
2583
2584         assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
2585                !a->level_zero);
2586         assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
2587                 a->opcode != ac_image_store_mip) ||
2588                a->lod);
2589         assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2590                (!a->compare && !a->offset));
2591         assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2592                 a->opcode == ac_image_get_lod) ||
2593                !a->bias);
2594         assert((a->bias ? 1 : 0) +
2595                (a->lod ? 1 : 0) +
2596                (a->level_zero ? 1 : 0) +
2597                (a->derivs[0] ? 1 : 0) <= 1);
2598
2599         if (a->opcode == ac_image_get_lod) {
2600                 switch (dim) {
2601                 case ac_image_1darray:
2602                         dim = ac_image_1d;
2603                         break;
2604                 case ac_image_2darray:
2605                 case ac_image_cube:
2606                         dim = ac_image_2d;
2607                         break;
2608                 default:
2609                         break;
2610                 }
2611         }
2612
2613         bool sample = a->opcode == ac_image_sample ||
2614                       a->opcode == ac_image_gather4 ||
2615                       a->opcode == ac_image_get_lod;
2616         bool atomic = a->opcode == ac_image_atomic ||
2617                       a->opcode == ac_image_atomic_cmpswap;
2618         bool load = a->opcode == ac_image_sample ||
2619                     a->opcode == ac_image_gather4 ||
2620                     a->opcode == ac_image_load ||
2621                     a->opcode == ac_image_load_mip;
2622         LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
2623
2624         if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2625                 args[num_args++] = a->data[0];
2626                 if (a->opcode == ac_image_atomic_cmpswap)
2627                         args[num_args++] = a->data[1];
2628         }
2629
2630         if (!atomic)
2631                 args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
2632
2633         if (a->offset)
2634                 args[num_args++] = ac_to_integer(ctx, a->offset);
2635         if (a->bias) {
2636                 args[num_args++] = ac_to_float(ctx, a->bias);
2637                 overload[num_overloads++] = ".f32";
2638         }
2639         if (a->compare)
2640                 args[num_args++] = ac_to_float(ctx, a->compare);
2641         if (a->derivs[0]) {
2642                 unsigned count = ac_num_derivs(dim);
2643                 for (unsigned i = 0; i < count; ++i)
2644                         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
2645                 overload[num_overloads++] = ".f32";
2646         }
2647         unsigned num_coords =
2648                 a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
2649         for (unsigned i = 0; i < num_coords; ++i)
2650                 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
2651         if (a->lod)
2652                 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
2653         overload[num_overloads++] = sample ? ".f32" : ".i32";
2654
2655         args[num_args++] = a->resource;
2656         if (sample) {
2657                 args[num_args++] = a->sampler;
2658                 args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
2659         }
2660
2661         args[num_args++] = ctx->i32_0; /* texfailctrl */
2662         args[num_args++] = LLVMConstInt(ctx->i32,
2663                                         load ? get_load_cache_policy(ctx, a->cache_policy) :
2664                                                a->cache_policy, false);
2665
2666         const char *name;
2667         const char *atomic_subop = "";
2668         switch (a->opcode) {
2669         case ac_image_sample: name = "sample"; break;
2670         case ac_image_gather4: name = "gather4"; break;
2671         case ac_image_load: name = "load"; break;
2672         case ac_image_load_mip: name = "load.mip"; break;
2673         case ac_image_store: name = "store"; break;
2674         case ac_image_store_mip: name = "store.mip"; break;
2675         case ac_image_atomic:
2676                 name = "atomic.";
2677                 atomic_subop = get_atomic_name(a->atomic);
2678                 break;
2679         case ac_image_atomic_cmpswap:
2680                 name = "atomic.";
2681                 atomic_subop = "cmpswap";
2682                 break;
2683         case ac_image_get_lod: name = "getlod"; break;
2684         case ac_image_get_resinfo: name = "getresinfo"; break;
2685         default: unreachable("invalid image opcode");
2686         }
2687
2688         const char *dimname;
2689         switch (dim) {
2690         case ac_image_1d: dimname = "1d"; break;
2691         case ac_image_2d: dimname = "2d"; break;
2692         case ac_image_3d: dimname = "3d"; break;
2693         case ac_image_cube: dimname = "cube"; break;
2694         case ac_image_1darray: dimname = "1darray"; break;
2695         case ac_image_2darray: dimname = "2darray"; break;
2696         case ac_image_2dmsaa: dimname = "2dmsaa"; break;
2697         case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
2698         default: unreachable("invalid dim");
2699         }
2700
2701         bool lod_suffix =
2702                 a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
2703         char intr_name[96];
2704         snprintf(intr_name, sizeof(intr_name),
2705                  "llvm.amdgcn.image.%s%s" /* base name */
2706                  "%s%s%s" /* sample/gather modifiers */
2707                  ".%s.%s%s%s%s", /* dimension and type overloads */
2708                  name, atomic_subop,
2709                  a->compare ? ".c" : "",
2710                  a->bias ? ".b" :
2711                  lod_suffix ? ".l" :
2712                  a->derivs[0] ? ".d" :
2713                  a->level_zero ? ".lz" : "",
2714                  a->offset ? ".o" : "",
2715                  dimname,
2716                  atomic ? "i32" : "v4f32",
2717                  overload[0], overload[1], overload[2]);
2718
2719         LLVMTypeRef retty;
2720         if (atomic)
2721                 retty = ctx->i32;
2722         else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
2723                 retty = ctx->voidt;
2724         else
2725                 retty = ctx->v4f32;
2726
2727         LLVMValueRef result =
2728                 ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
2729                                    a->attributes);
2730         if (!sample && retty == ctx->v4f32) {
2731                 result = LLVMBuildBitCast(ctx->builder, result,
2732                                           ctx->v4i32, "");
2733         }
2734         return result;
2735 }
2736
2737 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
2738                                     LLVMValueRef args[2])
2739 {
2740         LLVMTypeRef v2f16 =
2741                 LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
2742
2743         return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
2744                                   args, 2, AC_FUNC_ATTR_READNONE);
2745 }
2746
2747 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
2748                                      LLVMValueRef args[2])
2749 {
2750         LLVMValueRef res =
2751                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
2752                                    ctx->v2i16, args, 2,
2753                                    AC_FUNC_ATTR_READNONE);
2754         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2755 }
2756
2757 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
2758                                      LLVMValueRef args[2])
2759 {
2760         LLVMValueRef res =
2761                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
2762                                    ctx->v2i16, args, 2,
2763                                    AC_FUNC_ATTR_READNONE);
2764         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2765 }
2766
2767 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2768 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx,
2769                                  LLVMValueRef args[2], unsigned bits, bool hi)
2770 {
2771         assert(bits == 8 || bits == 10 || bits == 16);
2772
2773         LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2774                 bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2775         LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2776                 bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2777         LLVMValueRef max_alpha =
2778                 bits != 10 ? max_rgb : ctx->i32_1;
2779         LLVMValueRef min_alpha =
2780                 bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2781
2782         /* Clamp. */
2783         if (bits != 16) {
2784                 for (int i = 0; i < 2; i++) {
2785                         bool alpha = hi && i == 1;
2786                         args[i] = ac_build_imin(ctx, args[i],
2787                                                 alpha ? max_alpha : max_rgb);
2788                         args[i] = ac_build_imax(ctx, args[i],
2789                                                 alpha ? min_alpha : min_rgb);
2790                 }
2791         }
2792
2793         LLVMValueRef res =
2794                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
2795                                    ctx->v2i16, args, 2,
2796                                    AC_FUNC_ATTR_READNONE);
2797         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2798 }
2799
2800 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2801 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx,
2802                                  LLVMValueRef args[2], unsigned bits, bool hi)
2803 {
2804         assert(bits == 8 || bits == 10 || bits == 16);
2805
2806         LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2807                 bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2808         LLVMValueRef max_alpha =
2809                 bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2810
2811         /* Clamp. */
2812         if (bits != 16) {
2813                 for (int i = 0; i < 2; i++) {
2814                         bool alpha = hi && i == 1;
2815                         args[i] = ac_build_umin(ctx, args[i],
2816                                                 alpha ? max_alpha : max_rgb);
2817                 }
2818         }
2819
2820         LLVMValueRef res =
2821                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
2822                                    ctx->v2i16, args, 2,
2823                                    AC_FUNC_ATTR_READNONE);
2824         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2825 }
2826
2827 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2828 {
2829         return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
2830                                   &i1, 1, AC_FUNC_ATTR_READNONE);
2831 }
2832
2833 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2834 {
2835         ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
2836                            &i1, 1, 0);
2837 }
2838
2839 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
2840                           LLVMValueRef offset, LLVMValueRef width,
2841                           bool is_signed)
2842 {
2843         LLVMValueRef args[] = {
2844                 input,
2845                 offset,
2846                 width,
2847         };
2848
2849         LLVMValueRef result = ac_build_intrinsic(ctx,
2850                                                  is_signed ? "llvm.amdgcn.sbfe.i32" :
2851                                                              "llvm.amdgcn.ubfe.i32",
2852                                                  ctx->i32, args, 3,
2853                                                  AC_FUNC_ATTR_READNONE);
2854
2855         if (HAVE_LLVM < 0x0800) {
2856                 /* FIXME: LLVM 7+ returns incorrect result when count is 0.
2857                  * https://bugs.freedesktop.org/show_bug.cgi?id=107276
2858                  */
2859                 LLVMValueRef zero = ctx->i32_0;
2860                 LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, width, zero, "");
2861                 result = LLVMBuildSelect(ctx->builder, icond, zero, result, "");
2862         }
2863
2864         return result;
2865 }
2866
2867 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2868                            LLVMValueRef s1, LLVMValueRef s2)
2869 {
2870         return LLVMBuildAdd(ctx->builder,
2871                             LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2872 }
2873
2874 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2875                            LLVMValueRef s1, LLVMValueRef s2)
2876 {
2877         return LLVMBuildFAdd(ctx->builder,
2878                              LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2879 }
2880
2881 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2882 {
2883         if (!wait_flags)
2884                 return;
2885
2886         unsigned lgkmcnt = 63;
2887         unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
2888         unsigned vscnt = 63;
2889
2890         if (wait_flags & AC_WAIT_LGKM)
2891                 lgkmcnt = 0;
2892         if (wait_flags & AC_WAIT_VLOAD)
2893                 vmcnt = 0;
2894
2895         if (wait_flags & AC_WAIT_VSTORE) {
2896                 if (ctx->chip_class >= GFX10)
2897                         vscnt = 0;
2898                 else
2899                         vmcnt = 0;
2900         }
2901
2902         /* There is no intrinsic for vscnt(0), so use a fence. */
2903         if ((wait_flags & AC_WAIT_LGKM &&
2904              wait_flags & AC_WAIT_VLOAD &&
2905              wait_flags & AC_WAIT_VSTORE) ||
2906             vscnt == 0) {
2907                 LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2908                 return;
2909         }
2910
2911         unsigned simm16 = (lgkmcnt << 8) |
2912                           (7 << 4) | /* expcnt */
2913                           (vmcnt & 0xf) |
2914                           ((vmcnt >> 4) << 14);
2915
2916         LLVMValueRef args[1] = {
2917                 LLVMConstInt(ctx->i32, simm16, false),
2918         };
2919         ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
2920                            ctx->voidt, args, 1, 0);
2921 }
2922
2923 LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0,
2924                             LLVMValueRef src1, LLVMValueRef src2,
2925                             unsigned bitsize)
2926 {
2927         LLVMTypeRef type;
2928         char *intr;
2929
2930         if (bitsize == 16) {
2931                 intr = "llvm.amdgcn.fmed3.f16";
2932                 type = ctx->f16;
2933         } else if (bitsize == 32) {
2934                 intr = "llvm.amdgcn.fmed3.f32";
2935                 type = ctx->f32;
2936         } else {
2937                 intr = "llvm.amdgcn.fmed3.f64";
2938                 type = ctx->f64;
2939         }
2940
2941         LLVMValueRef params[] = {
2942                 src0,
2943                 src1,
2944                 src2,
2945         };
2946         return ac_build_intrinsic(ctx, intr, type, params, 3,
2947                                   AC_FUNC_ATTR_READNONE);
2948 }
2949
2950 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
2951                             unsigned bitsize)
2952 {
2953         LLVMTypeRef type;
2954         char *intr;
2955
2956         if (bitsize == 16) {
2957                 intr = "llvm.amdgcn.fract.f16";
2958                 type = ctx->f16;
2959         } else if (bitsize == 32) {
2960                 intr = "llvm.amdgcn.fract.f32";
2961                 type = ctx->f32;
2962         } else {
2963                 intr = "llvm.amdgcn.fract.f64";
2964                 type = ctx->f64;
2965         }
2966
2967         LLVMValueRef params[] = {
2968                 src0,
2969         };
2970         return ac_build_intrinsic(ctx, intr, type, params, 1,
2971                                   AC_FUNC_ATTR_READNONE);
2972 }
2973
2974 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2975                             unsigned bitsize)
2976 {
2977         LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
2978         LLVMValueRef zero = LLVMConstInt(type, 0, false);
2979         LLVMValueRef one = LLVMConstInt(type, 1, false);
2980
2981         LLVMValueRef cmp, val;
2982         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
2983         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2984         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
2985         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
2986         return val;
2987 }
2988
2989 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2990                             unsigned bitsize)
2991 {
2992         LLVMValueRef cmp, val, zero, one;
2993         LLVMTypeRef type;
2994
2995         if (bitsize == 16) {
2996                 type = ctx->f16;
2997                 zero = ctx->f16_0;
2998                 one = ctx->f16_1;
2999         } else if (bitsize == 32) {
3000                 type = ctx->f32;
3001                 zero = ctx->f32_0;
3002                 one = ctx->f32_1;
3003         } else {
3004                 type = ctx->f64;
3005                 zero = ctx->f64_0;
3006                 one = ctx->f64_1;
3007         }
3008
3009         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
3010         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
3011         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
3012         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
3013         return val;
3014 }
3015
3016 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
3017 {
3018         LLVMValueRef result;
3019         unsigned bitsize;
3020
3021         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3022
3023         switch (bitsize) {
3024         case 64:
3025                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
3026                                             (LLVMValueRef []) { src0 }, 1,
3027                                             AC_FUNC_ATTR_READNONE);
3028
3029                 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
3030                 break;
3031         case 32:
3032                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
3033                                             (LLVMValueRef []) { src0 }, 1,
3034                                             AC_FUNC_ATTR_READNONE);
3035                 break;
3036         case 16:
3037                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
3038                                             (LLVMValueRef []) { src0 }, 1,
3039                                             AC_FUNC_ATTR_READNONE);
3040
3041                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3042                 break;
3043         case 8:
3044                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8,
3045                                             (LLVMValueRef []) { src0 }, 1,
3046                                             AC_FUNC_ATTR_READNONE);
3047
3048                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3049                 break;
3050         default:
3051                 unreachable(!"invalid bitsize");
3052                 break;
3053         }
3054
3055         return result;
3056 }
3057
3058 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
3059                                        LLVMValueRef src0)
3060 {
3061         LLVMValueRef result;
3062         unsigned bitsize;
3063
3064         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3065
3066         switch (bitsize) {
3067         case 64:
3068                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64,
3069                                             (LLVMValueRef []) { src0 }, 1,
3070                                             AC_FUNC_ATTR_READNONE);
3071
3072                 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
3073                 break;
3074         case 32:
3075                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32,
3076                                             (LLVMValueRef []) { src0 }, 1,
3077                                             AC_FUNC_ATTR_READNONE);
3078                 break;
3079         case 16:
3080                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
3081                                             (LLVMValueRef []) { src0 }, 1,
3082                                             AC_FUNC_ATTR_READNONE);
3083
3084                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3085                 break;
3086         case 8:
3087                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8,
3088                                             (LLVMValueRef []) { src0 }, 1,
3089                                             AC_FUNC_ATTR_READNONE);
3090
3091                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3092                 break;
3093         default:
3094                 unreachable(!"invalid bitsize");
3095                 break;
3096         }
3097
3098         return result;
3099 }
3100
/* Operand indices of an export call instruction, as used with
 * LLVMGetOperand/LLVMSetOperand by the VS output optimization below:
 * the export target, the enabled-channels mask, and the first of the four
 * consecutive channel values.
 */
#define AC_EXP_TARGET           0
#define AC_EXP_ENABLED_CHANNELS 1
#define AC_EXP_OUT0             2
3104
/* Classification of one exported channel value. */
enum ac_ir_type {
        AC_IR_UNDEF, /* the operand is an LLVM undef */
        AC_IR_CONST, /* the operand is a floating-point constant */
        AC_IR_VALUE, /* any other operand */
};
3110
/* One channel of a parsed export instruction. */
struct ac_vs_exp_chan
{
        /* The operand exactly as read from the export call. */
        LLVMValueRef value;
        /* Constant value; only filled in when type is AC_IR_CONST. */
        float const_float;
        enum ac_ir_type type;
};
3117
/* A parsed PARAM export instruction. */
struct ac_vs_exp_inst {
        /* Export target relative to V_008DFC_SQ_EXP_PARAM. */
        unsigned offset;
        /* The export call instruction itself. */
        LLVMValueRef inst;
        /* The four exported channels (operands AC_EXP_OUT0 .. AC_EXP_OUT0+3). */
        struct ac_vs_exp_chan chan[4];
};
3123
/* List of PARAM exports that were processed and kept. */
struct ac_vs_exports {
        /* Number of valid entries in exp[]. */
        unsigned num;
        struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};
3128
3129 /* Return true if the PARAM export has been eliminated. */
3130 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
3131                                       uint32_t num_outputs,
3132                                       struct ac_vs_exp_inst *exp)
3133 {
3134         unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
3135         bool is_zero[4] = {}, is_one[4] = {};
3136
3137         for (i = 0; i < 4; i++) {
3138                 /* It's a constant expression. Undef outputs are eliminated too. */
3139                 if (exp->chan[i].type == AC_IR_UNDEF) {
3140                         is_zero[i] = true;
3141                         is_one[i] = true;
3142                 } else if (exp->chan[i].type == AC_IR_CONST) {
3143                         if (exp->chan[i].const_float == 0)
3144                                 is_zero[i] = true;
3145                         else if (exp->chan[i].const_float == 1)
3146                                 is_one[i] = true;
3147                         else
3148                                 return false; /* other constant */
3149                 } else
3150                         return false;
3151         }
3152
3153         /* Only certain combinations of 0 and 1 can be eliminated. */
3154         if (is_zero[0] && is_zero[1] && is_zero[2])
3155                 default_val = is_zero[3] ? 0 : 1;
3156         else if (is_one[0] && is_one[1] && is_one[2])
3157                 default_val = is_zero[3] ? 2 : 3;
3158         else
3159                 return false;
3160
3161         /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
3162         LLVMInstructionEraseFromParent(exp->inst);
3163
3164         /* Change OFFSET to DEFAULT_VAL. */
3165         for (i = 0; i < num_outputs; i++) {
3166                 if (vs_output_param_offset[i] == exp->offset) {
3167                         vs_output_param_offset[i] =
3168                                 AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
3169                         break;
3170                 }
3171         }
3172         return true;
3173 }
3174
3175 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
3176                                            uint8_t *vs_output_param_offset,
3177                                            uint32_t num_outputs,
3178                                            struct ac_vs_exports *processed,
3179                                            struct ac_vs_exp_inst *exp)
3180 {
3181         unsigned p, copy_back_channels = 0;
3182
3183         /* See if the output is already in the list of processed outputs.
3184          * The LLVMValueRef comparison relies on SSA.
3185          */
3186         for (p = 0; p < processed->num; p++) {
3187                 bool different = false;
3188
3189                 for (unsigned j = 0; j < 4; j++) {
3190                         struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
3191                         struct ac_vs_exp_chan *c2 = &exp->chan[j];
3192
3193                         /* Treat undef as a match. */
3194                         if (c2->type == AC_IR_UNDEF)
3195                                 continue;
3196
3197                         /* If c1 is undef but c2 isn't, we can copy c2 to c1
3198                          * and consider the instruction duplicated.
3199                          */
3200                         if (c1->type == AC_IR_UNDEF) {
3201                                 copy_back_channels |= 1 << j;
3202                                 continue;
3203                         }
3204
3205                         /* Test whether the channels are not equal. */
3206                         if (c1->type != c2->type ||
3207                             (c1->type == AC_IR_CONST &&
3208                              c1->const_float != c2->const_float) ||
3209                             (c1->type == AC_IR_VALUE &&
3210                              c1->value != c2->value)) {
3211                                 different = true;
3212                                 break;
3213                         }
3214                 }
3215                 if (!different)
3216                         break;
3217
3218                 copy_back_channels = 0;
3219         }
3220         if (p == processed->num)
3221                 return false;
3222
3223         /* If a match was found, but the matching export has undef where the new
3224          * one has a normal value, copy the normal value to the undef channel.
3225          */
3226         struct ac_vs_exp_inst *match = &processed->exp[p];
3227
3228         /* Get current enabled channels mask. */
3229         LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
3230         unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
3231
3232         while (copy_back_channels) {
3233                 unsigned chan = u_bit_scan(&copy_back_channels);
3234
3235                 assert(match->chan[chan].type == AC_IR_UNDEF);
3236                 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
3237                                exp->chan[chan].value);
3238                 match->chan[chan] = exp->chan[chan];
3239
3240                 /* Update number of enabled channels because the original mask
3241                  * is not always 0xf.
3242                  */
3243                 enabled_channels |= (1 << chan);
3244                 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
3245                                LLVMConstInt(ctx->i32, enabled_channels, 0));
3246         }
3247
3248         /* The PARAM export is duplicated. Kill it. */
3249         LLVMInstructionEraseFromParent(exp->inst);
3250
3251         /* Change OFFSET to the matching export. */
3252         for (unsigned i = 0; i < num_outputs; i++) {
3253                 if (vs_output_param_offset[i] == exp->offset) {
3254                         vs_output_param_offset[i] = match->offset;
3255                         break;
3256                 }
3257         }
3258         return true;
3259 }
3260
3261 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
3262                             LLVMValueRef main_fn,
3263                             uint8_t *vs_output_param_offset,
3264                             uint32_t num_outputs,
3265                             uint8_t *num_param_exports)
3266 {
3267         LLVMBasicBlockRef bb;
3268         bool removed_any = false;
3269         struct ac_vs_exports exports;
3270
3271         exports.num = 0;
3272
3273         /* Process all LLVM instructions. */
3274         bb = LLVMGetFirstBasicBlock(main_fn);
3275         while (bb) {
3276                 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
3277
3278                 while (inst) {
3279                         LLVMValueRef cur = inst;
3280                         inst = LLVMGetNextInstruction(inst);
3281                         struct ac_vs_exp_inst exp;
3282
3283                         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
3284                                 continue;
3285
3286                         LLVMValueRef callee = ac_llvm_get_called_value(cur);
3287
3288                         if (!ac_llvm_is_function(callee))
3289                                 continue;
3290
3291                         const char *name = LLVMGetValueName(callee);
3292                         unsigned num_args = LLVMCountParams(callee);
3293
3294                         /* Check if this is an export instruction. */
3295                         if ((num_args != 9 && num_args != 8) ||
3296                             (strcmp(name, "llvm.SI.export") &&
3297                              strcmp(name, "llvm.amdgcn.exp.f32")))
3298                                 continue;
3299
3300                         LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
3301                         unsigned target = LLVMConstIntGetZExtValue(arg);
3302
3303                         if (target < V_008DFC_SQ_EXP_PARAM)
3304                                 continue;
3305
3306                         target -= V_008DFC_SQ_EXP_PARAM;
3307
3308                         /* Parse the instruction. */
3309                         memset(&exp, 0, sizeof(exp));
3310                         exp.offset = target;
3311                         exp.inst = cur;
3312
3313                         for (unsigned i = 0; i < 4; i++) {
3314                                 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
3315
3316                                 exp.chan[i].value = v;
3317
3318                                 if (LLVMIsUndef(v)) {
3319                                         exp.chan[i].type = AC_IR_UNDEF;
3320                                 } else if (LLVMIsAConstantFP(v)) {
3321                                         LLVMBool loses_info;
3322                                         exp.chan[i].type = AC_IR_CONST;
3323                                         exp.chan[i].const_float =
3324                                                 LLVMConstRealGetDouble(v, &loses_info);
3325                                 } else {
3326                                         exp.chan[i].type = AC_IR_VALUE;
3327                                 }
3328                         }
3329
3330                         /* Eliminate constant and duplicated PARAM exports. */
3331                         if (ac_eliminate_const_output(vs_output_param_offset,
3332                                                       num_outputs, &exp) ||
3333                             ac_eliminate_duplicated_output(ctx,
3334                                                            vs_output_param_offset,
3335                                                            num_outputs, &exports,
3336                                                            &exp)) {
3337                                 removed_any = true;
3338                         } else {
3339                                 exports.exp[exports.num++] = exp;
3340                         }
3341                 }
3342                 bb = LLVMGetNextBasicBlock(bb);
3343         }
3344
3345         /* Remove holes in export memory due to removed PARAM exports.
3346          * This is done by renumbering all PARAM exports.
3347          */
3348         if (removed_any) {
3349                 uint8_t old_offset[VARYING_SLOT_MAX];
3350                 unsigned out, i;
3351
3352                 /* Make a copy of the offsets. We need the old version while
3353                  * we are modifying some of them. */
3354                 memcpy(old_offset, vs_output_param_offset,
3355                        sizeof(old_offset));
3356
3357                 for (i = 0; i < exports.num; i++) {
3358                         unsigned offset = exports.exp[i].offset;
3359
3360                         /* Update vs_output_param_offset. Multiple outputs can
3361                          * have the same offset.
3362                          */
3363                         for (out = 0; out < num_outputs; out++) {
3364                                 if (old_offset[out] == offset)
3365                                         vs_output_param_offset[out] = i;
3366                         }
3367
3368                         /* Change the PARAM offset in the instruction. */
3369                         LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
3370                                        LLVMConstInt(ctx->i32,
3371                                                     V_008DFC_SQ_EXP_PARAM + i, 0));
3372                 }
3373                 *num_param_exports = exports.num;
3374         }
3375 }
3376
3377 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
3378 {
3379         LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
3380         ac_build_intrinsic(ctx,
3381                            "llvm.amdgcn.init.exec", ctx->voidt,
3382                            &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
3383 }
3384
3385 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
3386 {
3387         unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
3388         ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
3389                                      LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
3390                                      "lds");
3391 }
3392
3393 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
3394                          LLVMValueRef dw_addr)
3395 {
3396         return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
3397 }
3398
3399 void ac_lds_store(struct ac_llvm_context *ctx,
3400                   LLVMValueRef dw_addr,
3401                   LLVMValueRef value)
3402 {
3403         value = ac_to_integer(ctx, value);
3404         ac_build_indexed_store(ctx, ctx->lds,
3405                                dw_addr, value);
3406 }
3407
3408 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
3409                          LLVMTypeRef dst_type,
3410                          LLVMValueRef src0)
3411 {
3412         unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3413         const char *intrin_name;
3414         LLVMTypeRef type;
3415         LLVMValueRef zero;
3416
3417         switch (src0_bitsize) {
3418         case 64:
3419                 intrin_name = "llvm.cttz.i64";
3420                 type = ctx->i64;
3421                 zero = ctx->i64_0;
3422                 break;
3423         case 32:
3424                 intrin_name = "llvm.cttz.i32";
3425                 type = ctx->i32;
3426                 zero = ctx->i32_0;
3427                 break;
3428         case 16:
3429                 intrin_name = "llvm.cttz.i16";
3430                 type = ctx->i16;
3431                 zero = ctx->i16_0;
3432                 break;
3433         case 8:
3434                 intrin_name = "llvm.cttz.i8";
3435                 type = ctx->i8;
3436                 zero = ctx->i8_0;
3437                 break;
3438         default:
3439                 unreachable(!"invalid bitsize");
3440         }
3441
3442         LLVMValueRef params[2] = {
3443                 src0,
3444
3445                 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
3446                  * add special code to check for x=0. The reason is that
3447                  * the LLVM behavior for x=0 is different from what we
3448                  * need here. However, LLVM also assumes that ffs(x) is
3449                  * in [0, 31], but GLSL expects that ffs(0) = -1, so
3450                  * a conditional assignment to handle 0 is still required.
3451                  *
3452                  * The hardware already implements the correct behavior.
3453                  */
3454                 ctx->i1true,
3455         };
3456
3457         LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
3458                                               params, 2,
3459                                               AC_FUNC_ATTR_READNONE);
3460
3461         if (src0_bitsize == 64) {
3462                 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
3463         } else if (src0_bitsize < 32) {
3464                 lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
3465         }
3466
3467         /* TODO: We need an intrinsic to skip this conditional. */
3468         /* Check for zero: */
3469         return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
3470                                                            LLVMIntEQ, src0,
3471                                                            zero, ""),
3472                                LLVMConstInt(ctx->i32, -1, 0), lsb, "");
3473 }
3474
3475 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
3476 {
3477         return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
3478 }
3479
3480 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
3481 {
3482         return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
3483 }
3484
3485 static struct ac_llvm_flow *
3486 get_current_flow(struct ac_llvm_context *ctx)
3487 {
3488         if (ctx->flow_depth > 0)
3489                 return &ctx->flow[ctx->flow_depth - 1];
3490         return NULL;
3491 }
3492
3493 static struct ac_llvm_flow *
3494 get_innermost_loop(struct ac_llvm_context *ctx)
3495 {
3496         for (unsigned i = ctx->flow_depth; i > 0; --i) {
3497                 if (ctx->flow[i - 1].loop_entry_block)
3498                         return &ctx->flow[i - 1];
3499         }
3500         return NULL;
3501 }
3502
3503 static struct ac_llvm_flow *
3504 push_flow(struct ac_llvm_context *ctx)
3505 {
3506         struct ac_llvm_flow *flow;
3507
3508         if (ctx->flow_depth >= ctx->flow_depth_max) {
3509                 unsigned new_max = MAX2(ctx->flow_depth << 1,
3510                                         AC_LLVM_INITIAL_CF_DEPTH);
3511
3512                 ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow));
3513                 ctx->flow_depth_max = new_max;
3514         }
3515
3516         flow = &ctx->flow[ctx->flow_depth];
3517         ctx->flow_depth++;
3518
3519         flow->next_block = NULL;
3520         flow->loop_entry_block = NULL;
3521         return flow;
3522 }
3523
3524 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
3525                                 int label_id)
3526 {
3527         char buf[32];
3528         snprintf(buf, sizeof(buf), "%s%d", base, label_id);
3529         LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
3530 }
3531
3532 /* Append a basic block at the level of the parent flow.
3533  */
3534 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
3535                                             const char *name)
3536 {
3537         assert(ctx->flow_depth >= 1);
3538
3539         if (ctx->flow_depth >= 2) {
3540                 struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2];
3541
3542                 return LLVMInsertBasicBlockInContext(ctx->context,
3543                                                      flow->next_block, name);
3544         }
3545
3546         LLVMValueRef main_fn =
3547                 LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3548         return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3549 }
3550
3551 /* Emit a branch to the given default target for the current block if
3552  * applicable -- that is, if the current block does not already contain a
3553  * branch from a break or continue.
3554  */
3555 static void emit_default_branch(LLVMBuilderRef builder,
3556                                 LLVMBasicBlockRef target)
3557 {
3558         if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3559                  LLVMBuildBr(builder, target);
3560 }
3561
3562 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3563 {
3564         struct ac_llvm_flow *flow = push_flow(ctx);
3565         flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3566         flow->next_block = append_basic_block(ctx, "ENDLOOP");
3567         set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3568         LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3569         LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3570 }
3571
3572 void ac_build_break(struct ac_llvm_context *ctx)
3573 {
3574         struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3575         LLVMBuildBr(ctx->builder, flow->next_block);
3576 }
3577
3578 void ac_build_continue(struct ac_llvm_context *ctx)
3579 {
3580         struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3581         LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3582 }
3583
3584 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3585 {
3586         struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3587         LLVMBasicBlockRef endif_block;
3588
3589         assert(!current_branch->loop_entry_block);
3590
3591         endif_block = append_basic_block(ctx, "ENDIF");
3592         emit_default_branch(ctx->builder, endif_block);
3593
3594         LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3595         set_basicblock_name(current_branch->next_block, "else", label_id);
3596
3597         current_branch->next_block = endif_block;
3598 }
3599
3600 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3601 {
3602         struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3603
3604         assert(!current_branch->loop_entry_block);
3605
3606         emit_default_branch(ctx->builder, current_branch->next_block);
3607         LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3608         set_basicblock_name(current_branch->next_block, "endif", label_id);
3609
3610         ctx->flow_depth--;
3611 }
3612
3613 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3614 {
3615         struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3616
3617         assert(current_loop->loop_entry_block);
3618
3619         emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3620
3621         LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3622         set_basicblock_name(current_loop->next_block, "endloop", label_id);
3623         ctx->flow_depth--;
3624 }
3625
3626 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3627 {
3628         struct ac_llvm_flow *flow = push_flow(ctx);
3629         LLVMBasicBlockRef if_block;
3630
3631         if_block = append_basic_block(ctx, "IF");
3632         flow->next_block = append_basic_block(ctx, "ELSE");
3633         set_basicblock_name(if_block, "if", label_id);
3634         LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3635         LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3636 }
3637
3638 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
3639                  int label_id)
3640 {
3641         LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
3642                                           value, ctx->f32_0, "");
3643         ac_build_ifcc(ctx, cond, label_id);
3644 }
3645
3646 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
3647                   int label_id)
3648 {
3649         LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3650                                           ac_to_integer(ctx, value),
3651                                           ctx->i32_0, "");
3652         ac_build_ifcc(ctx, cond, label_id);
3653 }
3654
3655 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
3656                              const char *name)
3657 {
3658         LLVMBuilderRef builder = ac->builder;
3659         LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3660         LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3661         LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3662         LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3663         LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3664         LLVMValueRef res;
3665
3666         if (first_instr) {
3667                 LLVMPositionBuilderBefore(first_builder, first_instr);
3668         } else {
3669                 LLVMPositionBuilderAtEnd(first_builder, first_block);
3670         }
3671
3672         res = LLVMBuildAlloca(first_builder, type, name);
3673         LLVMDisposeBuilder(first_builder);
3674         return res;
3675 }
3676
3677 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
3678                                    LLVMTypeRef type, const char *name)
3679 {
3680         LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3681         LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3682         return ptr;
3683 }
3684
3685 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
3686                          LLVMTypeRef type)
3687 {
3688         int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3689         return LLVMBuildBitCast(ctx->builder, ptr,
3690                                 LLVMPointerType(type, addr_space), "");
3691 }
3692
3693 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
3694                             unsigned count)
3695 {
3696         unsigned num_components = ac_get_llvm_num_components(value);
3697         if (count == num_components)
3698                 return value;
3699
3700         LLVMValueRef masks[MAX2(count, 2)];
3701         masks[0] = ctx->i32_0;
3702         masks[1] = ctx->i32_1;
3703         for (unsigned i = 2; i < count; i++)
3704                 masks[i] = LLVMConstInt(ctx->i32, i, false);
3705
3706         if (count == 1)
3707                 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
3708                                                "");
3709
3710         LLVMValueRef swizzle = LLVMConstVector(masks, count);
3711         return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3712 }
3713
3714 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
3715                              unsigned rshift, unsigned bitwidth)
3716 {
3717         LLVMValueRef value = param;
3718         if (rshift)
3719                 value = LLVMBuildLShr(ctx->builder, value,
3720                                       LLVMConstInt(ctx->i32, rshift, false), "");
3721
3722         if (rshift + bitwidth < 32) {
3723                 unsigned mask = (1 << bitwidth) - 1;
3724                 value = LLVMBuildAnd(ctx->builder, value,
3725                                      LLVMConstInt(ctx->i32, mask, false), "");
3726         }
3727         return value;
3728 }
3729
3730 /* Adjust the sample index according to FMASK.
3731  *
3732  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3733  * which is the identity mapping. Each nibble says which physical sample
3734  * should be fetched to get that sample.
3735  *
3736  * For example, 0x11111100 means there are only 2 samples stored and
3737  * the second sample covers 3/4 of the pixel. When reading samples 0
3738  * and 1, return physical sample 0 (determined by the first two 0s
3739  * in FMASK), otherwise return physical sample 1.
3740  *
3741  * The sample index should be adjusted as follows:
3742  *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3743  */
3744 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
3745                               LLVMValueRef *addr, bool is_array_tex)
3746 {
3747         struct ac_image_args fmask_load = {};
3748         fmask_load.opcode = ac_image_load;
3749         fmask_load.resource = fmask;
3750         fmask_load.dmask = 0xf;
3751         fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3752         fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3753
3754         fmask_load.coords[0] = addr[0];
3755         fmask_load.coords[1] = addr[1];
3756         if (is_array_tex)
3757                 fmask_load.coords[2] = addr[2];
3758
3759         LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3760         fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
3761                                               ac->i32_0, "");
3762
3763         /* Apply the formula. */
3764         unsigned sample_chan = is_array_tex ? 3 : 2;
3765         LLVMValueRef final_sample;
3766         final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3767                                     LLVMConstInt(ac->i32, 4, 0), "");
3768         final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3769         /* Mask the sample index by 0x7, because 0x8 means an unknown value
3770          * with EQAA, so those will map to 0. */
3771         final_sample = LLVMBuildAnd(ac->builder, final_sample,
3772                                     LLVMConstInt(ac->i32, 0x7, 0), "");
3773
3774         /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3775          * resource descriptor is 0 (invalid).
3776          */
3777         LLVMValueRef tmp;
3778         tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3779         tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3780         tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3781
3782         /* Replace the MSAA sample index. */
3783         addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
3784                                             addr[sample_chan], "");
3785 }
3786
3787 static LLVMValueRef
3788 _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3789 {
3790         ac_build_optimization_barrier(ctx, &src);
3791         return ac_build_intrinsic(ctx,
3792                         lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3793                         LLVMTypeOf(src), (LLVMValueRef []) {
3794                         src, lane },
3795                         lane == NULL ? 1 : 2,
3796                         AC_FUNC_ATTR_READNONE |
3797                         AC_FUNC_ATTR_CONVERGENT);
3798 }
3799
3800 /**
3801  * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3802  * @param ctx
3803  * @param src
3804  * @param lane - id of the lane or NULL for the first active lane
3805  * @return value of the lane
3806  */
3807 LLVMValueRef
3808 ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3809 {
3810         LLVMTypeRef src_type = LLVMTypeOf(src);
3811         src = ac_to_integer(ctx, src);
3812         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3813         LLVMValueRef ret;
3814
3815         if (bits == 32) {
3816                 ret = _ac_build_readlane(ctx, src, lane);
3817         } else {
3818                 assert(bits % 32 == 0);
3819                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3820                 LLVMValueRef src_vector =
3821                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3822                 ret = LLVMGetUndef(vec_type);
3823                 for (unsigned i = 0; i < bits / 32; i++) {
3824                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3825                                                 LLVMConstInt(ctx->i32, i, 0), "");
3826                         LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
3827                         ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
3828                                                 LLVMConstInt(ctx->i32, i, 0), "");
3829                 }
3830         }
3831         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3832 }
3833
3834 LLVMValueRef
3835 ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
3836 {
3837         if (HAVE_LLVM >= 0x0800) {
3838                 return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
3839                                           (LLVMValueRef []) {value, lane, src}, 3,
3840                                           AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3841         }
3842
3843         LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
3844                                           ac_get_thread_id(ctx), "");
3845         return LLVMBuildSelect(ctx->builder, pred, value, src, "");
3846 }
3847
3848 LLVMValueRef
3849 ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3850 {
3851         LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
3852                                                  LLVMVectorType(ctx->i32, 2),
3853                                                  "");
3854         LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
3855                                                        ctx->i32_0, "");
3856         LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
3857                                                        ctx->i32_1, "");
3858         LLVMValueRef val =
3859                 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3860                                    (LLVMValueRef []) { mask_lo, ctx->i32_0 },
3861                                    2, AC_FUNC_ATTR_READNONE);
3862         val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
3863                                  (LLVMValueRef []) { mask_hi, val },
3864                                  2, AC_FUNC_ATTR_READNONE);
3865         return val;
3866 }
3867
/* Control values passed as the dpp_ctrl operand of
 * "llvm.amdgcn.update.dpp.i32" (see _ac_build_dpp).
 *
 * Entries with a leading underscore are base encodings that are OR-ed with
 * an argument by a helper (dpp_quad_perm, dpp_row_sl, dpp_row_sr); no
 * helper for _dpp_row_rr is visible in this part of the file.  The
 * remaining entries are complete values that can be used directly.
 *
 * NOTE(review): judging by the names, sl/sr/rl/rr stand for shift/rotate
 * left/right and "wf" for whole-wavefront operations -- confirm against
 * the ISA documentation.
 */
enum dpp_ctrl {
        _dpp_quad_perm = 0x000,
        _dpp_row_sl = 0x100,
        _dpp_row_sr = 0x110,
        _dpp_row_rr = 0x120,
        dpp_wf_sl1 = 0x130,
        dpp_wf_rl1 = 0x134,
        dpp_wf_sr1 = 0x138,
        dpp_wf_rr1 = 0x13C,
        dpp_row_mirror = 0x140,
        dpp_row_half_mirror = 0x141,
        dpp_row_bcast15 = 0x142,
        dpp_row_bcast31 = 0x143
};
3882
3883 static inline enum dpp_ctrl
3884 dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
3885 {
3886         assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3887         return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3888 }
3889
3890 static inline enum dpp_ctrl
3891 dpp_row_sl(unsigned amount)
3892 {
3893         assert(amount > 0 && amount < 16);
3894         return _dpp_row_sl | amount;
3895 }
3896
3897 static inline enum dpp_ctrl
3898 dpp_row_sr(unsigned amount)
3899 {
3900         assert(amount > 0 && amount < 16);
3901         return _dpp_row_sr | amount;
3902 }
3903
3904 static LLVMValueRef
3905 _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3906               enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3907               bool bound_ctrl)
3908 {
3909         return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
3910                                         LLVMTypeOf(old),
3911                                         (LLVMValueRef[]) {
3912                                                 old, src,
3913                                                 LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3914                                                 LLVMConstInt(ctx->i32, row_mask, 0),
3915                                                 LLVMConstInt(ctx->i32, bank_mask, 0),
3916                                                 LLVMConstInt(ctx->i1, bound_ctrl, 0) },
3917                                         6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3918 }
3919
3920 static LLVMValueRef
3921 ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3922              enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3923              bool bound_ctrl)
3924 {
3925         LLVMTypeRef src_type = LLVMTypeOf(src);
3926         src = ac_to_integer(ctx, src);
3927         old = ac_to_integer(ctx, old);
3928         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3929         LLVMValueRef ret;
3930         if (bits == 32) {
3931                 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
3932                                     bank_mask, bound_ctrl);
3933         } else {
3934                 assert(bits % 32 == 0);
3935                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3936                 LLVMValueRef src_vector =
3937                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3938                 LLVMValueRef old_vector =
3939                         LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3940                 ret = LLVMGetUndef(vec_type);
3941                 for (unsigned i = 0; i < bits / 32; i++) {
3942                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3943                                                       LLVMConstInt(ctx->i32, i,
3944                                                                    0), "");
3945                         old = LLVMBuildExtractElement(ctx->builder, old_vector,
3946                                                       LLVMConstInt(ctx->i32, i,
3947                                                                    0), "");
3948                         LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
3949                                                               dpp_ctrl,
3950                                                               row_mask,
3951                                                               bank_mask,
3952                                                               bound_ctrl);
3953                         ret = LLVMBuildInsertElement(ctx->builder, ret,
3954                                                      ret_comp,
3955                                                      LLVMConstInt(ctx->i32, i,
3956                                                                   0), "");
3957                 }
3958         }
3959         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3960 }
3961
3962 static LLVMValueRef
3963 _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3964                      bool exchange_rows, bool bound_ctrl)
3965 {
3966         LLVMValueRef args[6] = {
3967                 src,
3968                 src,
3969                 LLVMConstInt(ctx->i32, sel, false),
3970                 LLVMConstInt(ctx->i32, sel >> 32, false),
3971                 ctx->i1true, /* fi */
3972                 bound_ctrl ? ctx->i1true : ctx->i1false,
3973         };
3974         return ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16"
3975                                                      : "llvm.amdgcn.permlane16",
3976                                   ctx->i32, args, 6,
3977                                   AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3978 }
3979
3980 static LLVMValueRef
3981 ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3982                     bool exchange_rows, bool bound_ctrl)
3983 {
3984         LLVMTypeRef src_type = LLVMTypeOf(src);
3985         src = ac_to_integer(ctx, src);
3986         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3987         LLVMValueRef ret;
3988         if (bits == 32) {
3989                 ret = _ac_build_permlane16(ctx, src, sel, exchange_rows,
3990                                            bound_ctrl);
3991         } else {
3992                 assert(bits % 32 == 0);
3993                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3994                 LLVMValueRef src_vector =
3995                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3996                 ret = LLVMGetUndef(vec_type);
3997                 for (unsigned i = 0; i < bits / 32; i++) {
3998                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3999                                                       LLVMConstInt(ctx->i32, i,
4000                                                                    0), "");
4001                         LLVMValueRef ret_comp =
4002                                 _ac_build_permlane16(ctx, src, sel,
4003                                                      exchange_rows,
4004                                                      bound_ctrl);
4005                         ret = LLVMBuildInsertElement(ctx->builder, ret,
4006                                                      ret_comp,
4007                                                      LLVMConstInt(ctx->i32, i,
4008                                                                   0), "");
4009                 }
4010         }
4011         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4012 }
4013
/* Pack three 5-bit masks into one swizzle pattern:
 * and_mask in bits 4:0, or_mask in bits 9:5 and xor_mask in bits 14:10.
 * Every mask must be below 32 (asserted).
 */
static inline unsigned
ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
        assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

        unsigned pattern = and_mask;

        pattern |= or_mask << 5;
        pattern |= xor_mask << 10;
        return pattern;
}
4020
4021 static LLVMValueRef
4022 _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
4023 {
4024         return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
4025                                    LLVMTypeOf(src), (LLVMValueRef []) {
4026                                         src, LLVMConstInt(ctx->i32, mask, 0) },
4027                                    2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
4028 }
4029
4030 LLVMValueRef
4031 ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
4032 {
4033         LLVMTypeRef src_type = LLVMTypeOf(src);
4034         src = ac_to_integer(ctx, src);
4035         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
4036         LLVMValueRef ret;
4037         if (bits == 32) {
4038                 ret = _ac_build_ds_swizzle(ctx, src, mask);
4039         } else {
4040                 assert(bits % 32 == 0);
4041                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
4042                 LLVMValueRef src_vector =
4043                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
4044                 ret = LLVMGetUndef(vec_type);
4045                 for (unsigned i = 0; i < bits / 32; i++) {
4046                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
4047                                                       LLVMConstInt(ctx->i32, i,
4048                                                                    0), "");
4049                         LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
4050                                                                      mask);
4051                         ret = LLVMBuildInsertElement(ctx->builder, ret,
4052                                                      ret_comp,
4053                                                      LLVMConstInt(ctx->i32, i,
4054                                                                   0), "");
4055                 }
4056         }
4057         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4058 }
4059
4060 static LLVMValueRef
4061 ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
4062 {
4063         char name[32], type[8];
4064         ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
4065         snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
4066         return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
4067                                   (LLVMValueRef []) { src }, 1,
4068                                   AC_FUNC_ATTR_READNONE);
4069 }
4070
4071 static LLVMValueRef
4072 ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
4073                       LLVMValueRef inactive)
4074 {
4075         char name[33], type[8];
4076         LLVMTypeRef src_type = LLVMTypeOf(src);
4077         src = ac_to_integer(ctx, src);
4078         inactive = ac_to_integer(ctx, inactive);
4079         ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
4080         snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
4081         LLVMValueRef ret =
4082                 ac_build_intrinsic(ctx, name,
4083                                         LLVMTypeOf(src), (LLVMValueRef []) {
4084                                         src, inactive }, 2,
4085                                         AC_FUNC_ATTR_READNONE |
4086                                         AC_FUNC_ATTR_CONVERGENT);
4087         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4088 }
4089
4090 static LLVMValueRef
4091 get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
4092 {
4093         if (type_size == 4) {
4094                 switch (op) {
4095                 case nir_op_iadd: return ctx->i32_0;
4096                 case nir_op_fadd: return ctx->f32_0;
4097                 case nir_op_imul: return ctx->i32_1;
4098                 case nir_op_fmul: return ctx->f32_1;
4099                 case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
4100                 case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
4101                 case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
4102                 case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
4103                 case nir_op_umax: return ctx->i32_0;
4104                 case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
4105                 case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
4106                 case nir_op_ior: return ctx->i32_0;
4107                 case nir_op_ixor: return ctx->i32_0;
4108                 default:
4109                         unreachable("bad reduction intrinsic");
4110                 }
4111         } else { /* type_size == 64bit */
4112                 switch (op) {
4113                 case nir_op_iadd: return ctx->i64_0;
4114                 case nir_op_fadd: return ctx->f64_0;
4115                 case nir_op_imul: return ctx->i64_1;
4116                 case nir_op_fmul: return ctx->f64_1;
4117                 case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
4118                 case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
4119                 case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
4120                 case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
4121                 case nir_op_umax: return ctx->i64_0;
4122                 case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
4123                 case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
4124                 case nir_op_ior: return ctx->i64_0;
4125                 case nir_op_ixor: return ctx->i64_0;
4126                 default:
4127                         unreachable("bad reduction intrinsic");
4128                 }
4129         }
4130 }
4131
4132 static LLVMValueRef
4133 ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
4134 {
4135         bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
4136         switch (op) {
4137         case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
4138         case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
4139         case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
4140         case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
4141         case nir_op_imin: return LLVMBuildSelect(ctx->builder,
4142                                         LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
4143                                         lhs, rhs, "");
4144         case nir_op_umin: return LLVMBuildSelect(ctx->builder,
4145                                         LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
4146                                         lhs, rhs, "");
4147         case nir_op_fmin: return ac_build_intrinsic(ctx,
4148                                         _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
4149                                         _64bit ? ctx->f64 : ctx->f32,
4150                                         (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
4151         case nir_op_imax: return LLVMBuildSelect(ctx->builder,
4152                                         LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
4153                                         lhs, rhs, "");
4154         case nir_op_umax: return LLVMBuildSelect(ctx->builder,
4155                                         LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
4156                                         lhs, rhs, "");
4157         case nir_op_fmax: return ac_build_intrinsic(ctx,
4158                                         _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
4159                                         _64bit ? ctx->f64 : ctx->f32,
4160                                         (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
4161         case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
4162         case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
4163         case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
4164         default:
4165                 unreachable("bad reduction intrinsic");
4166         }
4167 }
4168
4169 /**
4170  * \param maxprefix specifies that the result only needs to be correct for a
4171  *     prefix of this many threads
4172  *
4173  * TODO: add inclusive and excluse scan functions for GFX6.
4174  */
4175 static LLVMValueRef
4176 ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
4177               unsigned maxprefix, bool inclusive)
4178 {
4179         LLVMValueRef result, tmp;
4180
4181         if (ctx->chip_class >= GFX10) {
4182                 result = inclusive ? src : identity;
4183         } else {
4184                 if (inclusive)
4185                         result = src;
4186                 else
4187                         result = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
4188         }
4189         if (maxprefix <= 1)
4190                 return result;
4191         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
4192         result = ac_build_alu_op(ctx, result, tmp, op);
4193         if (maxprefix <= 2)
4194                 return result;
4195         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
4196         result = ac_build_alu_op(ctx, result, tmp, op);
4197         if (maxprefix <= 3)
4198                 return result;
4199         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
4200         result = ac_build_alu_op(ctx, result, tmp, op);
4201         if (maxprefix <= 4)
4202                 return result;
4203         tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
4204         result = ac_build_alu_op(ctx, result, tmp, op);
4205         if (maxprefix <= 8)
4206                 return result;
4207         tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
4208         result = ac_build_alu_op(ctx, result, tmp, op);
4209         if (maxprefix <= 16)
4210                 return result;
4211
4212         if (ctx->chip_class >= GFX10) {
4213                 /* dpp_row_bcast{15,31} are not supported on gfx10. */
4214                 LLVMBuilderRef builder = ctx->builder;
4215                 LLVMValueRef tid = ac_get_thread_id(ctx);
4216                 LLVMValueRef cc;
4217                 /* TODO-GFX10: Can we get better code-gen by putting this into
4218                  * a branch so that LLVM generates EXEC mask manipulations? */
4219                 if (inclusive)
4220                         tmp = result;
4221                 else
4222                         tmp = ac_build_alu_op(ctx, result, src, op);
4223                 tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
4224                 tmp = ac_build_alu_op(ctx, result, tmp, op);
4225                 cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
4226                 cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
4227                 result = LLVMBuildSelect(builder, cc, tmp, result, "");
4228                 if (maxprefix <= 32)
4229                         return result;
4230
4231                 if (inclusive)
4232                         tmp = result;
4233                 else
4234                         tmp = ac_build_alu_op(ctx, result, src, op);
4235                 tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
4236                 tmp = ac_build_alu_op(ctx, result, tmp, op);
4237                 cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
4238                                    LLVMConstInt(ctx->i32, 32, false), "");
4239                 result = LLVMBuildSelect(builder, cc, tmp, result, "");
4240                 return result;
4241         }
4242
4243         tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4244         result = ac_build_alu_op(ctx, result, tmp, op);
4245         if (maxprefix <= 32)
4246                 return result;
4247         tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4248         result = ac_build_alu_op(ctx, result, tmp, op);
4249         return result;
4250 }
4251
4252 LLVMValueRef
4253 ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4254 {
4255         LLVMValueRef result;
4256
4257         if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4258                 LLVMBuilderRef builder = ctx->builder;
4259                 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4260                 result = ac_build_ballot(ctx, src);
4261                 result = ac_build_mbcnt(ctx, result);
4262                 result = LLVMBuildAdd(builder, result, src, "");
4263                 return result;
4264         }
4265
4266         ac_build_optimization_barrier(ctx, &src);
4267
4268         LLVMValueRef identity =
4269                 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4270         result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4271                                   LLVMTypeOf(identity), "");
4272         result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
4273
4274         return ac_build_wwm(ctx, result);
4275 }
4276
4277 LLVMValueRef
4278 ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4279 {
4280         LLVMValueRef result;
4281
4282         if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4283                 LLVMBuilderRef builder = ctx->builder;
4284                 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4285                 result = ac_build_ballot(ctx, src);
4286                 result = ac_build_mbcnt(ctx, result);
4287                 return result;
4288         }
4289
4290         ac_build_optimization_barrier(ctx, &src);
4291
4292         LLVMValueRef identity =
4293                 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4294         result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4295                                   LLVMTypeOf(identity), "");
4296         result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
4297
4298         return ac_build_wwm(ctx, result);
4299 }
4300
4301 LLVMValueRef
4302 ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
4303 {
4304         if (cluster_size == 1) return src;
4305         ac_build_optimization_barrier(ctx, &src);
4306         LLVMValueRef result, swap;
4307         LLVMValueRef identity = get_reduction_identity(ctx, op,
4308                                                                 ac_get_type_size(LLVMTypeOf(src)));
4309         result = LLVMBuildBitCast(ctx->builder,
4310                                                                 ac_build_set_inactive(ctx, src, identity),
4311                                                                 LLVMTypeOf(identity), "");
4312         swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
4313         result = ac_build_alu_op(ctx, result, swap, op);
4314         if (cluster_size == 2) return ac_build_wwm(ctx, result);
4315
4316         swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
4317         result = ac_build_alu_op(ctx, result, swap, op);
4318         if (cluster_size == 4) return ac_build_wwm(ctx, result);
4319
4320         if (ctx->chip_class >= GFX8)
4321                 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
4322         else
4323                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
4324         result = ac_build_alu_op(ctx, result, swap, op);
4325         if (cluster_size == 8) return ac_build_wwm(ctx, result);
4326
4327         if (ctx->chip_class >= GFX8)
4328                 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
4329         else
4330                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
4331         result = ac_build_alu_op(ctx, result, swap, op);
4332         if (cluster_size == 16) return ac_build_wwm(ctx, result);
4333
4334         if (ctx->chip_class >= GFX10)
4335                 swap = ac_build_permlane16(ctx, result, 0, true, false);
4336         else if (ctx->chip_class >= GFX8 && cluster_size != 32)
4337                 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4338         else
4339                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
4340         result = ac_build_alu_op(ctx, result, swap, op);
4341         if (cluster_size == 32) return ac_build_wwm(ctx, result);
4342
4343         if (ctx->chip_class >= GFX8) {
4344                 if (ctx->chip_class >= GFX10)
4345                         swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
4346                 else
4347                         swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4348                 result = ac_build_alu_op(ctx, result, swap, op);
4349                 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
4350                 return ac_build_wwm(ctx, result);
4351         } else {
4352                 swap = ac_build_readlane(ctx, result, ctx->i32_0);
4353                 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
4354                 result = ac_build_alu_op(ctx, result, swap, op);
4355                 return ac_build_wwm(ctx, result);
4356         }
4357 }
4358
4359 /**
4360  * "Top half" of a scan that reduces per-wave values across an entire
4361  * workgroup.
4362  *
4363  * The source value must be present in the highest lane of the wave, and the
4364  * highest lane must be live.
4365  */
4366 void
4367 ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4368 {
4369         if (ws->maxwaves <= 1)
4370                 return;
4371
4372         const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
4373         LLVMBuilderRef builder = ctx->builder;
4374         LLVMValueRef tid = ac_get_thread_id(ctx);
4375         LLVMValueRef tmp;
4376
4377         tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
4378         ac_build_ifcc(ctx, tmp, 1000);
4379         LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4380         ac_build_endif(ctx, 1000);
4381 }
4382
4383 /**
4384  * "Bottom half" of a scan that reduces per-wave values across an entire
4385  * workgroup.
4386  *
4387  * The caller must place a barrier between the top and bottom halves.
4388  */
4389 void
4390 ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4391 {
4392         const LLVMTypeRef type = LLVMTypeOf(ws->src);
4393         const LLVMValueRef identity =
4394                 get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
4395
4396         if (ws->maxwaves <= 1) {
4397                 ws->result_reduce = ws->src;
4398                 ws->result_inclusive = ws->src;
4399                 ws->result_exclusive = identity;
4400                 return;
4401         }
4402         assert(ws->maxwaves <= 32);
4403
4404         LLVMBuilderRef builder = ctx->builder;
4405         LLVMValueRef tid = ac_get_thread_id(ctx);
4406         LLVMBasicBlockRef bbs[2];
4407         LLVMValueRef phivalues_scan[2];
4408         LLVMValueRef tmp, tmp2;
4409
4410         bbs[0] = LLVMGetInsertBlock(builder);
4411         phivalues_scan[0] = LLVMGetUndef(type);
4412
4413         if (ws->enable_reduce)
4414                 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4415         else if (ws->enable_inclusive)
4416                 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4417         else
4418                 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4419         ac_build_ifcc(ctx, tmp, 1001);
4420         {
4421                 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
4422
4423                 ac_build_optimization_barrier(ctx, &tmp);
4424
4425                 bbs[1] = LLVMGetInsertBlock(builder);
4426                 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
4427         }
4428         ac_build_endif(ctx, 1001);
4429
4430         const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4431
4432         if (ws->enable_reduce) {
4433                 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4434                 ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4435         }
4436         if (ws->enable_inclusive)
4437                 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4438         if (ws->enable_exclusive) {
4439                 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4440                 tmp = ac_build_readlane(ctx, scan, tmp);
4441                 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4442                 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4443         }
4444 }
4445
/**
 * Scan of a per-wave value across an entire workgroup: the top half,
 * an s_barrier, then the bottom half.
 *
 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
 * of the workgroup are live. (This requirement cannot easily be relaxed in a
 * useful manner because of the barrier in the algorithm.)
 */
void
ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
        /* Publish the per-wave value. */
        ac_build_wg_wavescan_top(ctx, ws);

        /* Barrier between the two halves. */
        ac_build_s_barrier(ctx);

        /* Scan the published values and fill in the results. */
        ac_build_wg_wavescan_bottom(ctx, ws);
}
4462
4463 /**
4464  * "Top half" of a scan that reduces per-thread values across an entire
4465  * workgroup.
4466  *
4467  * All lanes must be active when this code runs.
4468  */
4469 void
4470 ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4471 {
4472         if (ws->enable_exclusive) {
4473                 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4474                 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4475                         ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4476                 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4477         } else {
4478                 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4479         }
4480
4481         bool enable_inclusive = ws->enable_inclusive;
4482         bool enable_exclusive = ws->enable_exclusive;
4483         ws->enable_inclusive = false;
4484         ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4485         ac_build_wg_wavescan_top(ctx, ws);
4486         ws->enable_inclusive = enable_inclusive;
4487         ws->enable_exclusive = enable_exclusive;
4488 }
4489
4490 /**
4491  * "Bottom half" of a scan that reduces per-thread values across an entire
4492  * workgroup.
4493  *
4494  * The caller must place a barrier between the top and bottom halves.
4495  */
4496 void
4497 ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4498 {
4499         bool enable_inclusive = ws->enable_inclusive;
4500         bool enable_exclusive = ws->enable_exclusive;
4501         ws->enable_inclusive = false;
4502         ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4503         ac_build_wg_wavescan_bottom(ctx, ws);
4504         ws->enable_inclusive = enable_inclusive;
4505         ws->enable_exclusive = enable_exclusive;
4506
4507         /* ws->result_reduce is already the correct value */
4508         if (ws->enable_inclusive)
4509                 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4510         if (ws->enable_exclusive)
4511                 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4512 }
4513
/**
 * A scan that reduces per-thread values across an entire workgroup: the
 * top half, an s_barrier, then the bottom half.
 *
 * The caller must ensure that all lanes are active when this code runs
 * (WWM is insufficient!), because there is an implied barrier.
 */
void
ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
        /* Per-thread scan within the wave, then publish the wave's value. */
        ac_build_wg_scan_top(ctx, ws);

        /* Barrier between the two halves. */
        ac_build_s_barrier(ctx);

        /* Combine the per-wave results with the per-thread values. */
        ac_build_wg_scan_bottom(ctx, ws);
}
4527
4528 LLVMValueRef
4529 ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
4530                 unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
4531 {
4532         unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4533         if (ctx->chip_class >= GFX8) {
4534                 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4535         } else {
4536                 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4537         }
4538 }
4539
4540 LLVMValueRef
4541 ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4542 {
4543         index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4544         return ac_build_intrinsic(ctx,
4545                   "llvm.amdgcn.ds.bpermute", ctx->i32,
4546                   (LLVMValueRef []) {index, src}, 2,
4547                   AC_FUNC_ATTR_READNONE |
4548                   AC_FUNC_ATTR_CONVERGENT);
4549 }
4550
4551 LLVMValueRef
4552 ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0,
4553                    unsigned bitsize)
4554 {
4555         LLVMTypeRef type;
4556         char *intr;
4557
4558         if (bitsize == 16) {
4559                 intr = "llvm.amdgcn.frexp.exp.i16.f16";
4560                 type = ctx->i16;
4561         } else if (bitsize == 32) {
4562                 intr = "llvm.amdgcn.frexp.exp.i32.f32";
4563                 type = ctx->i32;
4564         } else {
4565                 intr = "llvm.amdgcn.frexp.exp.i32.f64";
4566                 type = ctx->i32;
4567         }
4568
4569         LLVMValueRef params[] = {
4570                 src0,
4571         };
4572         return ac_build_intrinsic(ctx, intr, type, params, 1,
4573                                   AC_FUNC_ATTR_READNONE);
4574 }
4575 LLVMValueRef
4576 ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0,
4577                     unsigned bitsize)
4578 {
4579         LLVMTypeRef type;
4580         char *intr;
4581
4582         if (bitsize == 16) {
4583                 intr = "llvm.amdgcn.frexp.mant.f16";
4584                 type = ctx->f16;
4585         } else if (bitsize == 32) {
4586                 intr = "llvm.amdgcn.frexp.mant.f32";
4587                 type = ctx->f32;
4588         } else {
4589                 intr = "llvm.amdgcn.frexp.mant.f64";
4590                 type = ctx->f64;
4591         }
4592
4593         LLVMValueRef params[] = {
4594                 src0,
4595         };
4596         return ac_build_intrinsic(ctx, intr, type, params, 1,
4597                                   AC_FUNC_ATTR_READNONE);
4598 }
4599
4600 /*
4601  * this takes an I,J coordinate pair,
4602  * and works out the X and Y derivatives.
4603  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4604  */
4605 LLVMValueRef
4606 ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4607 {
4608         LLVMValueRef result[4], a;
4609         unsigned i;
4610
4611         for (i = 0; i < 2; i++) {
4612                 a = LLVMBuildExtractElement(ctx->builder, interp_ij,
4613                                             LLVMConstInt(ctx->i32, i, false), "");
4614                 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4615                 result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4616         }
4617         return ac_build_gather_values(ctx, result, 4);
4618 }
4619
4620 LLVMValueRef
4621 ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4622 {
4623         LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live",
4624                                                  ctx->i1, NULL, 0,
4625                                                  AC_FUNC_ATTR_READNONE);
4626         result = LLVMBuildNot(ctx->builder, result, "");
4627         return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
4628 }
4629
4630 LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func,
4631                            LLVMValueRef *args, unsigned num_args)
4632 {
4633         LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
4634         LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
4635         return ret;
4636 }