src/amd/common/ac_llvm_build.c

   1 /*
   2  * Copyright 2014 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sub license, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  19  *
  20  * The above copyright notice and this permission notice (including the
  21  * next paragraph) shall be included in all copies or substantial portions
  22  * of the Software.
  23  *
  24  */
  25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
  26 #include "ac_llvm_build.h"
  27
  28 #include <llvm-c/Core.h>
  29
  30 #include "c11/threads.h"
  31
  32 #include <assert.h>
  33 #include <stdio.h>
  34
  35 #include "ac_llvm_util.h"
  36 #include "ac_exp_param.h"
  37 #include "util/bitscan.h"
  38 #include "util/macros.h"
  39 #include "util/u_atomic.h"
  40 #include "util/u_math.h"
  41 #include "sid.h"
  42
  43 #include "shader_enums.h"
  44
  45 #define AC_LLVM_INITIAL_CF_DEPTH 4
  46
/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
        /* Loop exit or next part of if/else/endif. */
        LLVMBasicBlockRef next_block;
        /* Basic block at which a loop is entered.
         * NOTE(review): presumably only meaningful for bgnloop/endloop
         * entries -- confirm against the code that fills this in. */
        LLVMBasicBlockRef loop_entry_block;
};
  54
  55 /* Initialize module-independent parts of the context.
  56  *
  57  * The caller is responsible for initializing ctx::module and ctx::builder.
  58  */
  59 void
  60 ac_llvm_context_init(struct ac_llvm_context *ctx,
  61                      enum chip_class chip_class, enum radeon_family family)
  62 {
  63         LLVMValueRef args[1];
  64
  65         ctx->context = LLVMContextCreate();
  66
  67         ctx->chip_class = chip_class;
  68         ctx->family = family;
  69         ctx->module = NULL;
  70         ctx->builder = NULL;
  71
  72         ctx->voidt = LLVMVoidTypeInContext(ctx->context);
  73         ctx->i1 = LLVMInt1TypeInContext(ctx->context);
  74         ctx->i8 = LLVMInt8TypeInContext(ctx->context);
  75         ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
  76         ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
  77         ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
  78         ctx->intptr = ctx->i32;
  79         ctx->f16 = LLVMHalfTypeInContext(ctx->context);
  80         ctx->f32 = LLVMFloatTypeInContext(ctx->context);
  81         ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
  82         ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
  83         ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
  84         ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
  85         ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
  86         ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
  87         ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
  88         ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
  89
  90         ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
  91         ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
  92         ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
  93         ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
  94         ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
  95         ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
  96         ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
  97         ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
  98         ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
  99         ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
 100         ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
 101         ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
 102         ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
 103         ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
 104
 105         ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
 106         ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
 107
 108         ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 109                                                      "range", 5);
 110
 111         ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 112                                                                "invariant.load", 14);
 113
 114         ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
 115
 116         args[0] = LLVMConstReal(ctx->f32, 2.5);
 117         ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
 118
 119         ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 120                                                         "amdgpu.uniform", 14);
 121
 122         ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
 123 }
 124
 125 void
 126 ac_llvm_context_dispose(struct ac_llvm_context *ctx)
 127 {
 128         free(ctx->flow);
 129         ctx->flow = NULL;
 130         ctx->flow_depth_max = 0;
 131 }
 132
 133 int
 134 ac_get_llvm_num_components(LLVMValueRef value)
 135 {
 136         LLVMTypeRef type = LLVMTypeOf(value);
 137         unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
 138                                       ? LLVMGetVectorSize(type)
 139                                       : 1;
 140         return num_components;
 141 }
 142
 143 LLVMValueRef
 144 ac_llvm_extract_elem(struct ac_llvm_context *ac,
 145                      LLVMValueRef value,
 146                      int index)
 147 {
 148         if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
 149                 assert(index == 0);
 150                 return value;
 151         }
 152
 153         return LLVMBuildExtractElement(ac->builder, value,
 154                                        LLVMConstInt(ac->i32, index, false), "");
 155 }
 156
 157 int
 158 ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
 159 {
 160         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
 161                 type = LLVMGetElementType(type);
 162
 163         if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
 164                 return LLVMGetIntTypeWidth(type);
 165
 166         if (type == ctx->f16)
 167                 return 16;
 168         if (type == ctx->f32)
 169                 return 32;
 170         if (type == ctx->f64)
 171                 return 64;
 172
 173         unreachable("Unhandled type kind in get_elem_bits");
 174 }
 175
 176 unsigned
 177 ac_get_type_size(LLVMTypeRef type)
 178 {
 179         LLVMTypeKind kind = LLVMGetTypeKind(type);
 180
 181         switch (kind) {
 182         case LLVMIntegerTypeKind:
 183                 return LLVMGetIntTypeWidth(type) / 8;
 184         case LLVMHalfTypeKind:
 185                 return 2;
 186         case LLVMFloatTypeKind:
 187                 return 4;
 188         case LLVMDoubleTypeKind:
 189                 return 8;
 190         case LLVMPointerTypeKind:
 191                 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
 192                         return 4;
 193                 return 8;
 194         case LLVMVectorTypeKind:
 195                 return LLVMGetVectorSize(type) *
 196                        ac_get_type_size(LLVMGetElementType(type));
 197         case LLVMArrayTypeKind:
 198                 return LLVMGetArrayLength(type) *
 199                        ac_get_type_size(LLVMGetElementType(type));
 200         default:
 201                 assert(0);
 202                 return 0;
 203         }
 204 }
 205
 206 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 207 {
 208         if (t == ctx->i8)
 209                 return ctx->i8;
 210         else if (t == ctx->f16 || t == ctx->i16)
 211                 return ctx->i16;
 212         else if (t == ctx->f32 || t == ctx->i32)
 213                 return ctx->i32;
 214         else if (t == ctx->f64 || t == ctx->i64)
 215                 return ctx->i64;
 216         else
 217                 unreachable("Unhandled integer size");
 218 }
 219
 220 LLVMTypeRef
 221 ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 222 {
 223         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 224                 LLVMTypeRef elem_type = LLVMGetElementType(t);
 225                 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
 226                                       LLVMGetVectorSize(t));
 227         }
 228         if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
 229                 switch (LLVMGetPointerAddressSpace(t)) {
 230                 case AC_ADDR_SPACE_GLOBAL:
 231                         return ctx->i64;
 232                 case AC_ADDR_SPACE_LDS:
 233                         return ctx->i32;
 234                 default:
 235                         unreachable("unhandled address space");
 236                 }
 237         }
 238         return to_integer_type_scalar(ctx, t);
 239 }
 240
 241 LLVMValueRef
 242 ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
 243 {
 244         LLVMTypeRef type = LLVMTypeOf(v);
 245         if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
 246                 return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 247         }
 248         return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 249 }
 250
 251 LLVMValueRef
 252 ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
 253 {
 254         LLVMTypeRef type = LLVMTypeOf(v);
 255         if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
 256                 return v;
 257         return ac_to_integer(ctx, v);
 258 }
 259
 260 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 261 {
 262         if (t == ctx->i8)
 263                 return ctx->i8;
 264         else if (t == ctx->i16 || t == ctx->f16)
 265                 return ctx->f16;
 266         else if (t == ctx->i32 || t == ctx->f32)
 267                 return ctx->f32;
 268         else if (t == ctx->i64 || t == ctx->f64)
 269                 return ctx->f64;
 270         else
 271                 unreachable("Unhandled float size");
 272 }
 273
 274 LLVMTypeRef
 275 ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
 276 {
 277         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
 278                 LLVMTypeRef elem_type = LLVMGetElementType(t);
 279                 return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
 280                                       LLVMGetVectorSize(t));
 281         }
 282         return to_float_type_scalar(ctx, t);
 283 }
 284
 285 LLVMValueRef
 286 ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
 287 {
 288         LLVMTypeRef type = LLVMTypeOf(v);
 289         return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
 290 }
 291
 292
 293 LLVMValueRef
 294 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
 295                    LLVMTypeRef return_type, LLVMValueRef *params,
 296                    unsigned param_count, unsigned attrib_mask)
 297 {
 298         LLVMValueRef function, call;
 299         bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
 300
 301         function = LLVMGetNamedFunction(ctx->module, name);
 302         if (!function) {
 303                 LLVMTypeRef param_types[32], function_type;
 304                 unsigned i;
 305
 306                 assert(param_count <= 32);
 307
 308                 for (i = 0; i < param_count; ++i) {
 309                         assert(params[i]);
 310                         param_types[i] = LLVMTypeOf(params[i]);
 311                 }
 312                 function_type =
 313                     LLVMFunctionType(return_type, param_types, param_count, 0);
 314                 function = LLVMAddFunction(ctx->module, name, function_type);
 315
 316                 LLVMSetFunctionCallConv(function, LLVMCCallConv);
 317                 LLVMSetLinkage(function, LLVMExternalLinkage);
 318
 319                 if (!set_callsite_attrs)
 320                         ac_add_func_attributes(ctx->context, function, attrib_mask);
 321         }
 322
 323         call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
 324         if (set_callsite_attrs)
 325                 ac_add_func_attributes(ctx->context, call, attrib_mask);
 326         return call;
 327 }
 328
 329 /**
 330  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 331  * intrinsic names).
 332  */
 333 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
 334 {
 335         LLVMTypeRef elem_type = type;
 336
 337         assert(bufsize >= 8);
 338
 339         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
 340                 int ret = snprintf(buf, bufsize, "v%u",
 341                                         LLVMGetVectorSize(type));
 342                 if (ret < 0) {
 343                         char *type_name = LLVMPrintTypeToString(type);
 344                         fprintf(stderr, "Error building type name for: %s\n",
 345                                 type_name);
 346                         return;
 347                 }
 348                 elem_type = LLVMGetElementType(type);
 349                 buf += ret;
 350                 bufsize -= ret;
 351         }
 352         switch (LLVMGetTypeKind(elem_type)) {
 353         default: break;
 354         case LLVMIntegerTypeKind:
 355                 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
 356                 break;
 357         case LLVMHalfTypeKind:
 358                 snprintf(buf, bufsize, "f16");
 359                 break;
 360         case LLVMFloatTypeKind:
 361                 snprintf(buf, bufsize, "f32");
 362                 break;
 363         case LLVMDoubleTypeKind:
 364                 snprintf(buf, bufsize, "f64");
 365                 break;
 366         }
 367 }
 368
 369 /**
 370  * Helper function that builds an LLVM IR PHI node and immediately adds
 371  * incoming edges.
 372  */
 373 LLVMValueRef
 374 ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
 375              unsigned count_incoming, LLVMValueRef *values,
 376              LLVMBasicBlockRef *blocks)
 377 {
 378         LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
 379         LLVMAddIncoming(phi, values, blocks, count_incoming);
 380         return phi;
 381 }
 382
 383 void ac_build_s_barrier(struct ac_llvm_context *ctx)
 384 {
 385         ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
 386                            0, AC_FUNC_ATTR_CONVERGENT);
 387 }
 388
 389 /* Prevent optimizations (at least of memory accesses) across the current
 390  * point in the program by emitting empty inline assembly that is marked as
 391  * having side effects.
 392  *
 393  * Optionally, a value can be passed through the inline assembly to prevent
 394  * LLVM from hoisting calls to ReadNone functions.
 395  */
 396 void
 397 ac_build_optimization_barrier(struct ac_llvm_context *ctx,
 398                               LLVMValueRef *pvgpr)
 399 {
 400         static int counter = 0;
 401
 402         LLVMBuilderRef builder = ctx->builder;
 403         char code[16];
 404
 405         snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
 406
 407         if (!pvgpr) {
 408                 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
 409                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
 410                 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
 411         } else {
 412                 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
 413                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
 414                 LLVMValueRef vgpr = *pvgpr;
 415                 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
 416                 unsigned vgpr_size = ac_get_type_size(vgpr_type);
 417                 LLVMValueRef vgpr0;
 418
 419                 assert(vgpr_size % 4 == 0);
 420
 421                 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
 422                 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
 423                 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
 424                 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
 425                 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
 426
 427                 *pvgpr = vgpr;
 428         }
 429 }
 430
 431 LLVMValueRef
 432 ac_build_shader_clock(struct ac_llvm_context *ctx)
 433 {
 434         LLVMValueRef tmp = ac_build_intrinsic(ctx, "llvm.readcyclecounter",
 435                                               ctx->i64, NULL, 0, 0);
 436         return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
 437 }
 438
 439 LLVMValueRef
 440 ac_build_ballot(struct ac_llvm_context *ctx,
 441                 LLVMValueRef value)
 442 {
 443         LLVMValueRef args[3] = {
 444                 value,
 445                 ctx->i32_0,
 446                 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
 447         };
 448
 449         /* We currently have no other way to prevent LLVM from lifting the icmp
 450          * calls to a dominating basic block.
 451          */
 452         ac_build_optimization_barrier(ctx, &args[0]);
 453
 454         args[0] = ac_to_integer(ctx, args[0]);
 455
 456         return ac_build_intrinsic(ctx,
 457                                   "llvm.amdgcn.icmp.i32",
 458                                   ctx->i64, args, 3,
 459                                   AC_FUNC_ATTR_NOUNWIND |
 460                                   AC_FUNC_ATTR_READNONE |
 461                                   AC_FUNC_ATTR_CONVERGENT);
 462 }
 463
 464 LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
 465                                  LLVMValueRef value)
 466 {
 467         LLVMValueRef args[3] = {
 468                 value,
 469                 ctx->i1false,
 470                 LLVMConstInt(ctx->i32, LLVMIntNE, 0),
 471         };
 472
 473         assert(HAVE_LLVM >= 0x0800);
 474         return ac_build_intrinsic(ctx, "llvm.amdgcn.icmp.i1", ctx->i64, args, 3,
 475                                   AC_FUNC_ATTR_NOUNWIND |
 476                                   AC_FUNC_ATTR_READNONE |
 477                                   AC_FUNC_ATTR_CONVERGENT);
 478 }
 479
 480 LLVMValueRef
 481 ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
 482 {
 483         LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
 484         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 485         return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
 486 }
 487
 488 LLVMValueRef
 489 ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
 490 {
 491         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 492         return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
 493                              LLVMConstInt(ctx->i64, 0, 0), "");
 494 }
 495
 496 LLVMValueRef
 497 ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
 498 {
 499         LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
 500         LLVMValueRef vote_set = ac_build_ballot(ctx, value);
 501
 502         LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 503                                          vote_set, active_set, "");
 504         LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 505                                           vote_set,
 506                                           LLVMConstInt(ctx->i64, 0, 0), "");
 507         return LLVMBuildOr(ctx->builder, all, none, "");
 508 }
 509
 510 LLVMValueRef
 511 ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
 512                                unsigned value_count, unsigned component)
 513 {
 514         LLVMValueRef vec = NULL;
 515
 516         if (value_count == 1) {
 517                 return values[component];
 518         } else if (!value_count)
 519                 unreachable("value_count is 0");
 520
 521         for (unsigned i = component; i < value_count + component; i++) {
 522                 LLVMValueRef value = values[i];
 523
 524                 if (i == component)
 525                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 526                 LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
 527                 vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
 528         }
 529         return vec;
 530 }
 531
 532 LLVMValueRef
 533 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
 534                                 LLVMValueRef *values,
 535                                 unsigned value_count,
 536                                 unsigned value_stride,
 537                                 bool load,
 538                                 bool always_vector)
 539 {
 540         LLVMBuilderRef builder = ctx->builder;
 541         LLVMValueRef vec = NULL;
 542         unsigned i;
 543
 544         if (value_count == 1 && !always_vector) {
 545                 if (load)
 546                         return LLVMBuildLoad(builder, values[0], "");
 547                 return values[0];
 548         } else if (!value_count)
 549                 unreachable("value_count is 0");
 550
 551         for (i = 0; i < value_count; i++) {
 552                 LLVMValueRef value = values[i * value_stride];
 553                 if (load)
 554                         value = LLVMBuildLoad(builder, value, "");
 555
 556                 if (!i)
 557                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 558                 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
 559                 vec = LLVMBuildInsertElement(builder, vec, value, index, "");
 560         }
 561         return vec;
 562 }
 563
 564 LLVMValueRef
 565 ac_build_gather_values(struct ac_llvm_context *ctx,
 566                        LLVMValueRef *values,
 567                        unsigned value_count)
 568 {
 569         return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
 570 }
 571
 572 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 573  * channels with undef. Extract at most src_channels components from the input.
 574  */
 575 static LLVMValueRef
 576 ac_build_expand(struct ac_llvm_context *ctx,
 577                 LLVMValueRef value,
 578                 unsigned src_channels,
 579                 unsigned dst_channels)
 580 {
 581         LLVMTypeRef elemtype;
 582         LLVMValueRef chan[dst_channels];
 583
 584         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
 585                 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
 586
 587                 if (src_channels == dst_channels && vec_size == dst_channels)
 588                         return value;
 589
 590                 src_channels = MIN2(src_channels, vec_size);
 591
 592                 for (unsigned i = 0; i < src_channels; i++)
 593                         chan[i] = ac_llvm_extract_elem(ctx, value, i);
 594
 595                 elemtype = LLVMGetElementType(LLVMTypeOf(value));
 596         } else {
 597                 if (src_channels) {
 598                         assert(src_channels == 1);
 599                         chan[0] = value;
 600                 }
 601                 elemtype = LLVMTypeOf(value);
 602         }
 603
 604         for (unsigned i = src_channels; i < dst_channels; i++)
 605                 chan[i] = LLVMGetUndef(elemtype);
 606
 607         return ac_build_gather_values(ctx, chan, dst_channels);
 608 }
 609
 610 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
 611  * with undef. Extract at most num_channels components from the input.
 612  */
 613 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
 614                                      LLVMValueRef value,
 615                                      unsigned num_channels)
 616 {
 617         return ac_build_expand(ctx, value, num_channels, 4);
 618 }
 619
 620 LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
 621 {
 622         unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
 623         const char *name;
 624
 625         if (type_size == 2)
 626                 name = "llvm.rint.f16";
 627         else if (type_size == 4)
 628                 name = "llvm.rint.f32";
 629         else
 630                 name = "llvm.rint.f64";
 631
 632         return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
 633                                   AC_FUNC_ATTR_READNONE);
 634 }
 635
 636 LLVMValueRef
 637 ac_build_fdiv(struct ac_llvm_context *ctx,
 638               LLVMValueRef num,
 639               LLVMValueRef den)
 640 {
 641         /* If we do (num / den), LLVM >= 7.0 does:
 642          *    return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
 643          *
 644          * If we do (num * (1 / den)), LLVM does:
 645          *    return num * v_rcp_f32(den);
 646          */
 647         LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
 648         LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
 649         LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
 650
 651         /* Use v_rcp_f32 instead of precise division. */
 652         if (!LLVMIsConstant(ret))
 653                 LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
 654         return ret;
 655 }
 656
 657 /* See fast_idiv_by_const.h. */
 658 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
 659 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
 660                                 LLVMValueRef num,
 661                                 LLVMValueRef multiplier,
 662                                 LLVMValueRef pre_shift,
 663                                 LLVMValueRef post_shift,
 664                                 LLVMValueRef increment)
 665 {
 666         LLVMBuilderRef builder = ctx->builder;
 667
 668         num = LLVMBuildLShr(builder, num, pre_shift, "");
 669         num = LLVMBuildMul(builder,
 670                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 671                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 672         num = LLVMBuildAdd(builder, num,
 673                            LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
 674         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 675         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 676         return LLVMBuildLShr(builder, num, post_shift, "");
 677 }
 678
 679 /* See fast_idiv_by_const.h. */
 680 /* If num != UINT_MAX, this more efficient version can be used. */
 681 /* Set: increment = util_fast_udiv_info::increment; */
 682 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
 683                                     LLVMValueRef num,
 684                                     LLVMValueRef multiplier,
 685                                     LLVMValueRef pre_shift,
 686                                     LLVMValueRef post_shift,
 687                                     LLVMValueRef increment)
 688 {
 689         LLVMBuilderRef builder = ctx->builder;
 690
 691         num = LLVMBuildLShr(builder, num, pre_shift, "");
 692         num = LLVMBuildNUWAdd(builder, num, increment, "");
 693         num = LLVMBuildMul(builder,
 694                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 695                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 696         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 697         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 698         return LLVMBuildLShr(builder, num, post_shift, "");
 699 }
 700
 701 /* See fast_idiv_by_const.h. */
 702 /* Both operands must fit in 31 bits and the divisor must not be 1. */
 703 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
 704                                               LLVMValueRef num,
 705                                               LLVMValueRef multiplier,
 706                                               LLVMValueRef post_shift)
 707 {
 708         LLVMBuilderRef builder = ctx->builder;
 709
 710         num = LLVMBuildMul(builder,
 711                            LLVMBuildZExt(builder, num, ctx->i64, ""),
 712                            LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
 713         num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
 714         num = LLVMBuildTrunc(builder, num, ctx->i32, "");
 715         return LLVMBuildLShr(builder, num, post_shift, "");
 716 }
 717
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
        /* stc[0] holds sc and stc[1] holds tc. */
        LLVMValueRef stc[2];
        /* Major axis value (times two, see above). */
        LLVMValueRef ma;
        /* Cube face number, stored as a float value. */
        LLVMValueRef id;
};
 727
 728 static void
 729 build_cube_intrinsic(struct ac_llvm_context *ctx,
 730                      LLVMValueRef in[3],
 731                      struct cube_selection_coords *out)
 732 {
 733         LLVMTypeRef f32 = ctx->f32;
 734
 735         out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
 736                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 737         out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
 738                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 739         out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
 740                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 741         out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
 742                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 743 }
 744
 745 /**
 746  * Build a manual selection sequence for cube face sc/tc coordinates and
 747  * major axis vector (multiplied by 2 for consistency) for the given
 748  * vec3 \p coords, for the face implied by \p selcoords.
 749  *
 750  * For the major axis, we always adjust the sign to be in the direction of
 751  * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 752  * the selcoords major axis.
 753  */
 754 static void build_cube_select(struct ac_llvm_context *ctx,
 755                               const struct cube_selection_coords *selcoords,
 756                               const LLVMValueRef *coords,
 757                               LLVMValueRef *out_st,
 758                               LLVMValueRef *out_ma)
 759 {
 760         LLVMBuilderRef builder = ctx->builder;
 761         LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
 762         LLVMValueRef is_ma_positive;
 763         LLVMValueRef sgn_ma;
 764         LLVMValueRef is_ma_z, is_not_ma_z;
 765         LLVMValueRef is_ma_y;
 766         LLVMValueRef is_ma_x;
 767         LLVMValueRef sgn;
 768         LLVMValueRef tmp;
 769
 770         is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
 771                 selcoords->ma, LLVMConstReal(f32, 0.0), "");
 772         sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
 773                 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
 774
 775         is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
 776         is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
 777         is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
 778                 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
 779         is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
 780
 781         /* Select sc */
 782         tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
 783         sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
 784                 LLVMBuildSelect(builder, is_ma_z, sgn_ma,
 785                         LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
 786         out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
 787
 788         /* Select tc */
 789         tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
 790         sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
 791                 LLVMConstReal(f32, -1.0), "");
 792         out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
 793
 794         /* Select ma */
 795         tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
 796                 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
 797         tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 798                                  ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
 799         *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
 800 }
 801
 802 void
 803 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
 804                        bool is_deriv, bool is_array, bool is_lod,
 805                        LLVMValueRef *coords_arg,
 806                        LLVMValueRef *derivs_arg)
 807 {
 808
 809         LLVMBuilderRef builder = ctx->builder;
 810         struct cube_selection_coords selcoords;
 811         LLVMValueRef coords[3];
 812         LLVMValueRef invma;
 813
 814         if (is_array && !is_lod) {
 815                 LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
 816
 817                 /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
 818                  *
 819                  *    "For Array forms, the array layer used will be
 820                  *
 821                  *       max(0, min(d−1, floor(layer+0.5)))
 822                  *
 823                  *     where d is the depth of the texture array and layer
 824                  *     comes from the component indicated in the tables below.
 825                  *     Workaroudn for an issue where the layer is taken from a
 826                  *     helper invocation which happens to fall on a different
 827                  *     layer due to extrapolation."
 828                  *
 829                  * VI and earlier attempt to implement this in hardware by
 830                  * clamping the value of coords[2] = (8 * layer) + face.
 831                  * Unfortunately, this means that the we end up with the wrong
 832                  * face when clamping occurs.
 833                  *
 834                  * Clamp the layer earlier to work around the issue.
 835                  */
 836                 if (ctx->chip_class <= VI) {
 837                         LLVMValueRef ge0;
 838                         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
 839                         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
 840                 }
 841
 842                 coords_arg[3] = tmp;
 843         }
 844
 845         build_cube_intrinsic(ctx, coords_arg, &selcoords);
 846
 847         invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 848                         ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
 849         invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
 850
 851         for (int i = 0; i < 2; ++i)
 852                 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
 853
 854         coords[2] = selcoords.id;
 855
 856         if (is_deriv && derivs_arg) {
 857                 LLVMValueRef derivs[4];
 858                 int axis;
 859
 860                 /* Convert cube derivatives to 2D derivatives. */
 861                 for (axis = 0; axis < 2; axis++) {
 862                         LLVMValueRef deriv_st[2];
 863                         LLVMValueRef deriv_ma;
 864
 865                         /* Transform the derivative alongside the texture
 866                          * coordinate. Mathematically, the correct formula is
 867                          * as follows. Assume we're projecting onto the +Z face
 868                          * and denote by dx/dh the derivative of the (original)
 869                          * X texture coordinate with respect to horizontal
 870                          * window coordinates. The projection onto the +Z face
 871                          * plane is:
 872                          *
 873                          *   f(x,z) = x/z
 874                          *
 875                          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
 876                          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
 877                          *
 878                          * This motivatives the implementation below.
 879                          *
 880                          * Whether this actually gives the expected results for
 881                          * apps that might feed in derivatives obtained via
 882                          * finite differences is anyone's guess. The OpenGL spec
 883                          * seems awfully quiet about how textureGrad for cube
 884                          * maps should be handled.
 885                          */
 886                         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
 887                                           deriv_st, &deriv_ma);
 888
 889                         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
 890
 891                         for (int i = 0; i < 2; ++i)
 892                                 derivs[axis * 2 + i] =
 893                                         LLVMBuildFSub(builder,
 894                                                 LLVMBuildFMul(builder, deriv_st[i], invma, ""),
 895                                                 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
 896                 }
 897
 898                 memcpy(derivs_arg, derivs, sizeof(derivs));
 899         }
 900
 901         /* Shift the texture coordinate. This must be applied after the
 902          * derivative calculation.
 903          */
 904         for (int i = 0; i < 2; ++i)
 905                 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
 906
 907         if (is_array) {
 908                 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
 909                 /* coords_arg.w component - array_index for cube arrays */
 910                 coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
 911         }
 912
 913         memcpy(coords_arg, coords, sizeof(coords));
 914 }
 915
 916
 917 LLVMValueRef
 918 ac_build_fs_interp(struct ac_llvm_context *ctx,
 919                    LLVMValueRef llvm_chan,
 920                    LLVMValueRef attr_number,
 921                    LLVMValueRef params,
 922                    LLVMValueRef i,
 923                    LLVMValueRef j)
 924 {
 925         LLVMValueRef args[5];
 926         LLVMValueRef p1;
 927
 928         args[0] = i;
 929         args[1] = llvm_chan;
 930         args[2] = attr_number;
 931         args[3] = params;
 932
 933         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
 934                                 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 935
 936         args[0] = p1;
 937         args[1] = j;
 938         args[2] = llvm_chan;
 939         args[3] = attr_number;
 940         args[4] = params;
 941
 942         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
 943                                   ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 944 }
 945
 946 LLVMValueRef
 947 ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
 948                        LLVMValueRef llvm_chan,
 949                        LLVMValueRef attr_number,
 950                        LLVMValueRef params,
 951                        LLVMValueRef i,
 952                        LLVMValueRef j)
 953 {
 954         LLVMValueRef args[6];
 955         LLVMValueRef p1;
 956
 957         args[0] = i;
 958         args[1] = llvm_chan;
 959         args[2] = attr_number;
 960         args[3] = ctx->i1false;
 961         args[4] = params;
 962
 963         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
 964                                 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 965
 966         args[0] = p1;
 967         args[1] = j;
 968         args[2] = llvm_chan;
 969         args[3] = attr_number;
 970         args[4] = ctx->i1false;
 971         args[5] = params;
 972
 973         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
 974                                   ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
 975 }
 976
 977 LLVMValueRef
 978 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
 979                        LLVMValueRef parameter,
 980                        LLVMValueRef llvm_chan,
 981                        LLVMValueRef attr_number,
 982                        LLVMValueRef params)
 983 {
 984         LLVMValueRef args[4];
 985
 986         args[0] = parameter;
 987         args[1] = llvm_chan;
 988         args[2] = attr_number;
 989         args[3] = params;
 990
 991         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
 992                                   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 993 }
 994
 995 LLVMValueRef
 996 ac_build_gep_ptr(struct ac_llvm_context *ctx,
 997                  LLVMValueRef base_ptr,
 998                  LLVMValueRef index)
 999 {
1000         return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1001 }
1002
1003 LLVMValueRef
1004 ac_build_gep0(struct ac_llvm_context *ctx,
1005               LLVMValueRef base_ptr,
1006               LLVMValueRef index)
1007 {
1008         LLVMValueRef indices[2] = {
1009                 ctx->i32_0,
1010                 index,
1011         };
1012         return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
1013 }
1014
1015 LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
1016                                   LLVMValueRef index)
1017 {
1018         return LLVMBuildPointerCast(ctx->builder,
1019                                     ac_build_gep0(ctx, ptr, index),
1020                                     LLVMTypeOf(ptr), "");
1021 }
1022
1023 void
1024 ac_build_indexed_store(struct ac_llvm_context *ctx,
1025                        LLVMValueRef base_ptr, LLVMValueRef index,
1026                        LLVMValueRef value)
1027 {
1028         LLVMBuildStore(ctx->builder, value,
1029                        ac_build_gep0(ctx, base_ptr, index));
1030 }
1031
1032 /**
1033  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1034  * It's equivalent to doing a load from &base_ptr[index].
1035  *
1036  * \param base_ptr  Where the array starts.
1037  * \param index     The element index into the array.
1038  * \param uniform   Whether the base_ptr and index can be assumed to be
1039  *                  dynamically uniform (i.e. load to an SGPR)
1040  * \param invariant Whether the load is invariant (no other opcodes affect it)
1041  * \param no_unsigned_wraparound
1042  *    For all possible re-associations and re-distributions of an expression
1043  *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1044  *    without inbounds in base_ptr), this parameter is true if "addr + offset"
1045  *    does not result in an unsigned integer wraparound. This is used for
1046  *    optimal code generation of 32-bit pointer arithmetic.
1047  *
1048  *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
1049  *    integer wraparound can't be an imm offset in s_load_dword, because
1050  *    the instruction performs "addr + offset" in 64 bits.
1051  *
1052  *    Expected usage for bindless textures by chaining GEPs:
1053  *      // possible unsigned wraparound, don't use InBounds:
1054  *      ptr1 = LLVMBuildGEP(base_ptr, index);
1055  *      image = load(ptr1); // becomes "s_load ptr1, 0"
1056  *
1057  *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1058  *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1059  */
1060 static LLVMValueRef
1061 ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1062                      LLVMValueRef index, bool uniform, bool invariant,
1063                      bool no_unsigned_wraparound)
1064 {
1065         LLVMValueRef pointer, result;
1066         LLVMValueRef indices[2] = {ctx->i32_0, index};
1067
1068         if (no_unsigned_wraparound &&
1069             LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1070                 pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, indices, 2, "");
1071         else
1072                 pointer = LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
1073
1074         if (uniform)
1075                 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1076         result = LLVMBuildLoad(ctx->builder, pointer, "");
1077         if (invariant)
1078                 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1079         return result;
1080 }
1081
1082 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1083                            LLVMValueRef index)
1084 {
1085         return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
1086 }
1087
1088 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
1089                                      LLVMValueRef base_ptr, LLVMValueRef index)
1090 {
1091         return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
1092 }
1093
1094 /* This assumes that there is no unsigned integer wraparound during the address
1095  * computation, excluding all GEPs within base_ptr. */
1096 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
1097                                    LLVMValueRef base_ptr, LLVMValueRef index)
1098 {
1099         return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1100 }
1101
1102 /* See ac_build_load_custom() documentation. */
1103 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1104                                    LLVMValueRef base_ptr, LLVMValueRef index)
1105 {
1106         return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1107 }
1108
1109 static void
1110 ac_build_buffer_store_common(struct ac_llvm_context *ctx,
1111                              LLVMValueRef rsrc,
1112                              LLVMValueRef data,
1113                              LLVMValueRef vindex,
1114                              LLVMValueRef voffset,
1115                              unsigned num_channels,
1116                              bool glc,
1117                              bool slc,
1118                              bool writeonly_memory,
1119                              bool use_format)
1120 {
1121         LLVMValueRef args[] = {
1122                 data,
1123                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1124                 vindex ? vindex : ctx->i32_0,
1125                 voffset,
1126                 LLVMConstInt(ctx->i1, glc, 0),
1127                 LLVMConstInt(ctx->i1, slc, 0)
1128         };
1129         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1130
1131         const char *type_names[] = {"f32", "v2f32", "v4f32"};
1132         char name[256];
1133
1134         if (use_format) {
1135                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s",
1136                          type_names[func]);
1137         } else {
1138                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
1139                          type_names[func]);
1140         }
1141
1142         ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
1143                            ac_get_store_intr_attribs(writeonly_memory));
1144 }
1145
1146 static void
1147 ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
1148                                    LLVMValueRef rsrc,
1149                                    LLVMValueRef data,
1150                                    LLVMValueRef vindex,
1151                                    LLVMValueRef voffset,
1152                                    LLVMValueRef soffset,
1153                                    unsigned num_channels,
1154                                    LLVMTypeRef return_channel_type,
1155                                    bool glc,
1156                                    bool slc,
1157                                    bool writeonly_memory,
1158                                    bool use_format,
1159                                    bool structurized)
1160 {
1161         LLVMValueRef args[6];
1162         int idx = 0;
1163         args[idx++] = data;
1164         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1165         if (structurized)
1166                 args[idx++] = vindex ? vindex : ctx->i32_0;
1167         args[idx++] = voffset ? voffset : ctx->i32_0;
1168         args[idx++] = soffset ? soffset : ctx->i32_0;
1169         args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
1170         unsigned func = num_channels == 3 ? 4 : num_channels;
1171         const char *indexing_kind = structurized ? "struct" : "raw";
1172         char name[256], type_name[8];
1173
1174         LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type;
1175         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1176
1177         if (use_format) {
1178                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
1179                          indexing_kind, type_name);
1180         } else {
1181                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s",
1182                          indexing_kind, type_name);
1183         }
1184
1185         ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
1186                            ac_get_store_intr_attribs(writeonly_memory));
1187 }
1188
1189 void
1190 ac_build_buffer_store_format(struct ac_llvm_context *ctx,
1191                              LLVMValueRef rsrc,
1192                              LLVMValueRef data,
1193                              LLVMValueRef vindex,
1194                              LLVMValueRef voffset,
1195                              unsigned num_channels,
1196                              bool glc,
1197                              bool writeonly_memory)
1198 {
1199         if (HAVE_LLVM >= 0x800) {
1200                 ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
1201                                                    voffset, NULL, num_channels,
1202                                                    ctx->f32, glc, false,
1203                                                    writeonly_memory, true, true);
1204         } else {
1205                 ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset,
1206                                              num_channels, glc, false,
1207                                              writeonly_memory, true);
1208         }
1209 }
1210
1211 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1212  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1213  * or v4i32 (num_channels=3,4).
1214  */
1215 void
1216 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
1217                             LLVMValueRef rsrc,
1218                             LLVMValueRef vdata,
1219                             unsigned num_channels,
1220                             LLVMValueRef voffset,
1221                             LLVMValueRef soffset,
1222                             unsigned inst_offset,
1223                             bool glc,
1224                             bool slc,
1225                             bool writeonly_memory,
1226                             bool swizzle_enable_hint)
1227 {
1228         /* Split 3 channel stores, becase LLVM doesn't support 3-channel
1229          * intrinsics. */
1230         if (num_channels == 3) {
1231                 LLVMValueRef v[3], v01;
1232
1233                 for (int i = 0; i < 3; i++) {
1234                         v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
1235                                         LLVMConstInt(ctx->i32, i, 0), "");
1236                 }
1237                 v01 = ac_build_gather_values(ctx, v, 2);
1238
1239                 ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
1240                                             soffset, inst_offset, glc, slc,
1241                                             writeonly_memory, swizzle_enable_hint);
1242                 ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
1243                                             soffset, inst_offset + 8,
1244                                             glc, slc,
1245                                             writeonly_memory, swizzle_enable_hint);
1246                 return;
1247         }
1248
1249         /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
1250          * (voffset is swizzled, but soffset isn't swizzled).
1251          * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
1252          */
1253         if (!swizzle_enable_hint) {
1254                 LLVMValueRef offset = soffset;
1255
1256                 if (inst_offset)
1257                         offset = LLVMBuildAdd(ctx->builder, offset,
1258                                               LLVMConstInt(ctx->i32, inst_offset, 0), "");
1259
1260                 if (HAVE_LLVM >= 0x800) {
1261                         ac_build_llvm8_buffer_store_common(ctx, rsrc,
1262                                                            ac_to_float(ctx, vdata),
1263                                                            ctx->i32_0,
1264                                                            voffset, offset,
1265                                                            num_channels,
1266                                                            ctx->f32,
1267                                                            glc, slc,
1268                                                            writeonly_memory,
1269                                                            false, false);
1270                 } else {
1271                         if (voffset)
1272                                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1273
1274                         ac_build_buffer_store_common(ctx, rsrc,
1275                                                      ac_to_float(ctx, vdata),
1276                                                      ctx->i32_0, offset,
1277                                                      num_channels, glc, slc,
1278                                                      writeonly_memory, false);
1279                 }
1280                 return;
1281         }
1282
1283         static const unsigned dfmts[] = {
1284                 V_008F0C_BUF_DATA_FORMAT_32,
1285                 V_008F0C_BUF_DATA_FORMAT_32_32,
1286                 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1287                 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1288         };
1289         unsigned dfmt = dfmts[num_channels - 1];
1290         unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1291         LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
1292
1293         ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1294                                    immoffset, num_channels, dfmt, nfmt, glc,
1295                                    slc, writeonly_memory);
1296 }
1297
1298 static LLVMValueRef
1299 ac_build_buffer_load_common(struct ac_llvm_context *ctx,
1300                             LLVMValueRef rsrc,
1301                             LLVMValueRef vindex,
1302                             LLVMValueRef voffset,
1303                             unsigned num_channels,
1304                             bool glc,
1305                             bool slc,
1306                             bool can_speculate,
1307                             bool use_format)
1308 {
1309         LLVMValueRef args[] = {
1310                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1311                 vindex ? vindex : ctx->i32_0,
1312                 voffset,
1313                 LLVMConstInt(ctx->i1, glc, 0),
1314                 LLVMConstInt(ctx->i1, slc, 0)
1315         };
1316         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1317
1318         LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
1319         const char *type_names[] = {"f32", "v2f32", "v4f32"};
1320         char name[256];
1321
1322         if (use_format) {
1323                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s",
1324                          type_names[func]);
1325         } else {
1326                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
1327                          type_names[func]);
1328         }
1329
1330         return ac_build_intrinsic(ctx, name, types[func], args,
1331                                   ARRAY_SIZE(args),
1332                                   ac_get_load_intr_attribs(can_speculate));
1333 }
1334
1335 static LLVMValueRef
1336 ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
1337                                   LLVMValueRef rsrc,
1338                                   LLVMValueRef vindex,
1339                                   LLVMValueRef voffset,
1340                                   LLVMValueRef soffset,
1341                                   unsigned num_channels,
1342                                   LLVMTypeRef channel_type,
1343                                   bool glc,
1344                                   bool slc,
1345                                   bool can_speculate,
1346                                   bool use_format,
1347                                   bool structurized)
1348 {
1349         LLVMValueRef args[5];
1350         int idx = 0;
1351         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1352         if (structurized)
1353                 args[idx++] = vindex ? vindex : ctx->i32_0;
1354         args[idx++] = voffset ? voffset : ctx->i32_0;
1355         args[idx++] = soffset ? soffset : ctx->i32_0;
1356         args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
1357         unsigned func = num_channels == 3 ? 4 : num_channels;
1358         const char *indexing_kind = structurized ? "struct" : "raw";
1359         char name[256], type_name[8];
1360
1361         LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
1362         ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1363
1364         if (use_format) {
1365                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
1366                          indexing_kind, type_name);
1367         } else {
1368                 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
1369                          indexing_kind, type_name);
1370         }
1371
1372         return ac_build_intrinsic(ctx, name, type, args, idx,
1373                                   ac_get_load_intr_attribs(can_speculate));
1374 }
1375
1376 LLVMValueRef
1377 ac_build_buffer_load(struct ac_llvm_context *ctx,
1378                      LLVMValueRef rsrc,
1379                      int num_channels,
1380                      LLVMValueRef vindex,
1381                      LLVMValueRef voffset,
1382                      LLVMValueRef soffset,
1383                      unsigned inst_offset,
1384                      unsigned glc,
1385                      unsigned slc,
1386                      bool can_speculate,
1387                      bool allow_smem)
1388 {
1389         LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
1390         if (voffset)
1391                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1392         if (soffset)
1393                 offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
1394
1395         if (allow_smem && !slc &&
1396             (!glc || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= VI))) {
1397                 assert(vindex == NULL);
1398
1399                 LLVMValueRef result[8];
1400
1401                 for (int i = 0; i < num_channels; i++) {
1402                         if (i) {
1403                                 offset = LLVMBuildAdd(ctx->builder, offset,
1404                                                       LLVMConstInt(ctx->i32, 4, 0), "");
1405                         }
1406                         const char *intrname =
1407                                 HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
1408                                                     : "llvm.SI.load.const.v4i32";
1409                         unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
1410                         LLVMValueRef args[3] = {
1411                                 rsrc,
1412                                 offset,
1413                                 glc ? ctx->i32_1 : ctx->i32_0,
1414                         };
1415                         result[i] = ac_build_intrinsic(ctx, intrname,
1416                                                        ctx->f32, args, num_args,
1417                                                        AC_FUNC_ATTR_READNONE |
1418                                                        (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0));
1419                 }
1420                 if (num_channels == 1)
1421                         return result[0];
1422
1423                 if (num_channels == 3)
1424                         result[num_channels++] = LLVMGetUndef(ctx->f32);
1425                 return ac_build_gather_values(ctx, result, num_channels);
1426         }
1427
1428         if (HAVE_LLVM >= 0x0800) {
1429                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex,
1430                                                          offset, ctx->i32_0,
1431                                                          num_channels, ctx->f32,
1432                                                          glc, slc,
1433                                                          can_speculate, false,
1434                                                          false);
1435         }
1436
1437         return ac_build_buffer_load_common(ctx, rsrc, vindex, offset,
1438                                            num_channels, glc, slc,
1439                                            can_speculate, false);
1440 }
1441
1442 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
1443                                          LLVMValueRef rsrc,
1444                                          LLVMValueRef vindex,
1445                                          LLVMValueRef voffset,
1446                                          unsigned num_channels,
1447                                          bool glc,
1448                                          bool can_speculate)
1449 {
1450         if (HAVE_LLVM >= 0x800) {
1451                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1452                                                          num_channels, ctx->f32,
1453                                                          glc, false,
1454                                                          can_speculate, true, true);
1455         }
1456         return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
1457                                            num_channels, glc, false,
1458                                            can_speculate, true);
1459 }
1460
1461 LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
1462                                                   LLVMValueRef rsrc,
1463                                                   LLVMValueRef vindex,
1464                                                   LLVMValueRef voffset,
1465                                                   unsigned num_channels,
1466                                                   bool glc,
1467                                                   bool can_speculate)
1468 {
1469         if (HAVE_LLVM >= 0x800) {
1470                 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1471                                                          num_channels, ctx->f32,
1472                                                          glc, false,
1473                                                          can_speculate, true, true);
1474         }
1475
1476         LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
1477         LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
1478         stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
1479
1480         LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
1481                                                       LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
1482                                                       elem_count, stride, "");
1483
1484         LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
1485                                                        LLVMConstInt(ctx->i32, 2, 0), "");
1486
1487         return ac_build_buffer_load_common(ctx, new_rsrc, vindex, voffset,
1488                                            num_channels, glc, false,
1489                                            can_speculate, true);
1490 }
1491
1492 static LLVMValueRef
1493 ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
1494                             LLVMValueRef rsrc,
1495                             LLVMValueRef vindex,
1496                             LLVMValueRef voffset,
1497                             LLVMValueRef soffset,
1498                             unsigned num_channels,
1499                             unsigned dfmt,
1500                             unsigned nfmt,
1501                             bool glc,
1502                             bool slc,
1503                             bool can_speculate,
1504                             bool structurized)
1505 {
1506         LLVMValueRef args[6];
1507         int idx = 0;
1508         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1509         if (structurized)
1510                 args[idx++] = vindex ? vindex : ctx->i32_0;
1511         args[idx++] = voffset ? voffset : ctx->i32_0;
1512         args[idx++] = soffset ? soffset : ctx->i32_0;
1513         args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
1514         args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
1515         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1516
1517         LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
1518         const char *type_names[] = {"i32", "v2i32", "v4i32"};
1519         const char *indexing_kind = structurized ? "struct" : "raw";
1520         char name[256];
1521
1522         snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
1523                  indexing_kind, type_names[func]);
1524
1525         return ac_build_intrinsic(ctx, name, types[func], args,
1526                                   idx,
1527                                   ac_get_load_intr_attribs(can_speculate));
1528 }
1529
1530 static LLVMValueRef
1531 ac_build_tbuffer_load(struct ac_llvm_context *ctx,
1532                             LLVMValueRef rsrc,
1533                             LLVMValueRef vindex,
1534                             LLVMValueRef voffset,
1535                             LLVMValueRef soffset,
1536                             LLVMValueRef immoffset,
1537                             unsigned num_channels,
1538                             unsigned dfmt,
1539                             unsigned nfmt,
1540                             bool glc,
1541                             bool slc,
1542                             bool can_speculate,
1543                             bool structurized) /* only matters for LLVM 8+ */
1544 {
1545         if (HAVE_LLVM >= 0x800) {
1546                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1547
1548                 return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset,
1549                                                    soffset, num_channels,
1550                                                    dfmt, nfmt, glc, slc,
1551                                                    can_speculate, structurized);
1552         }
1553
1554         LLVMValueRef args[] = {
1555                 rsrc,
1556                 vindex ? vindex : ctx->i32_0,
1557                 voffset,
1558                 soffset,
1559                 immoffset,
1560                 LLVMConstInt(ctx->i32, dfmt, false),
1561                 LLVMConstInt(ctx->i32, nfmt, false),
1562                 LLVMConstInt(ctx->i1, glc, false),
1563                 LLVMConstInt(ctx->i1, slc, false),
1564         };
1565         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1566         LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
1567         const char *type_names[] = {"i32", "v2i32", "v4i32"};
1568         char name[256];
1569
1570         snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s",
1571                  type_names[func]);
1572
1573         return ac_build_intrinsic(ctx, name, types[func], args, 9,
1574                                   ac_get_load_intr_attribs(can_speculate));
1575 }
1576
1577 LLVMValueRef
1578 ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx,
1579                              LLVMValueRef rsrc,
1580                              LLVMValueRef vindex,
1581                              LLVMValueRef voffset,
1582                              LLVMValueRef soffset,
1583                              LLVMValueRef immoffset,
1584                              unsigned num_channels,
1585                              unsigned dfmt,
1586                              unsigned nfmt,
1587                              bool glc,
1588                              bool slc,
1589                              bool can_speculate)
1590 {
1591         return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset,
1592                                      immoffset, num_channels, dfmt, nfmt, glc,
1593                                      slc, can_speculate, true);
1594 }
1595
1596 LLVMValueRef
1597 ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx,
1598                           LLVMValueRef rsrc,
1599                           LLVMValueRef voffset,
1600                           LLVMValueRef soffset,
1601                           LLVMValueRef immoffset,
1602                           unsigned num_channels,
1603                           unsigned dfmt,
1604                           unsigned nfmt,
1605                           bool glc,
1606                           bool slc,
1607                           bool can_speculate)
1608 {
1609         return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset,
1610                                      immoffset, num_channels, dfmt, nfmt, glc,
1611                                      slc, can_speculate, false);
1612 }
1613
1614 LLVMValueRef
1615 ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
1616                             LLVMValueRef rsrc,
1617                             LLVMValueRef voffset,
1618                             LLVMValueRef soffset,
1619                             LLVMValueRef immoffset,
1620                             bool glc)
1621 {
1622         LLVMValueRef res;
1623
1624         if (HAVE_LLVM >= 0x900) {
1625                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1626
1627                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1628                 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1629                                                         voffset, soffset,
1630                                                         1, ctx->i16, glc, false,
1631                                                         false, false, false);
1632         } else {
1633                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1634                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1635
1636                 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1637                                                 immoffset, 1, dfmt, nfmt, glc, false,
1638                                                 false);
1639
1640                 res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
1641         }
1642
1643         return res;
1644 }
1645
1646 LLVMValueRef
1647 ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx,
1648                            LLVMValueRef rsrc,
1649                            LLVMValueRef voffset,
1650                            LLVMValueRef soffset,
1651                            LLVMValueRef immoffset,
1652                            bool glc)
1653 {
1654         LLVMValueRef res;
1655
1656         if (HAVE_LLVM >= 0x900) {
1657                 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1658
1659                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1660                 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1661                                                         voffset, soffset,
1662                                                         1, ctx->i8, glc, false,
1663                                                         false, false, false);
1664         } else {
1665                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1666                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1667
1668                 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1669                                                 immoffset, 1, dfmt, nfmt, glc, false,
1670                                                 false);
1671
1672                 res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
1673         }
1674
1675         return res;
1676 }
1677 static void
1678 ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
1679                              LLVMValueRef rsrc,
1680                              LLVMValueRef vdata,
1681                              LLVMValueRef vindex,
1682                              LLVMValueRef voffset,
1683                              LLVMValueRef soffset,
1684                              unsigned num_channels,
1685                              unsigned dfmt,
1686                              unsigned nfmt,
1687                              bool glc,
1688                              bool slc,
1689                              bool writeonly_memory,
1690                              bool structurized)
1691 {
1692         LLVMValueRef args[7];
1693         int idx = 0;
1694         args[idx++] = vdata;
1695         args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1696         if (structurized)
1697                 args[idx++] = vindex ? vindex : ctx->i32_0;
1698         args[idx++] = voffset ? voffset : ctx->i32_0;
1699         args[idx++] = soffset ? soffset : ctx->i32_0;
1700         args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
1701         args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
1702         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1703
1704         const char *type_names[] = {"i32", "v2i32", "v4i32"};
1705         const char *indexing_kind = structurized ? "struct" : "raw";
1706         char name[256];
1707
1708         snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
1709                  indexing_kind, type_names[func]);
1710
1711         ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
1712                            ac_get_store_intr_attribs(writeonly_memory));
1713 }
1714
1715 static void
1716 ac_build_tbuffer_store(struct ac_llvm_context *ctx,
1717                        LLVMValueRef rsrc,
1718                        LLVMValueRef vdata,
1719                        LLVMValueRef vindex,
1720                        LLVMValueRef voffset,
1721                        LLVMValueRef soffset,
1722                        LLVMValueRef immoffset,
1723                        unsigned num_channels,
1724                        unsigned dfmt,
1725                        unsigned nfmt,
1726                        bool glc,
1727                        bool slc,
1728                        bool writeonly_memory,
1729                        bool structurized) /* only matters for LLVM 8+ */
1730 {
1731         if (HAVE_LLVM >= 0x800) {
1732                 voffset = LLVMBuildAdd(ctx->builder,
1733                                        voffset ? voffset : ctx->i32_0,
1734                                        immoffset, "");
1735
1736                 ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
1737                                              soffset, num_channels, dfmt, nfmt,
1738                                              glc, slc, writeonly_memory,
1739                                              structurized);
1740         } else {
1741                 LLVMValueRef params[] = {
1742                         vdata,
1743                         rsrc,
1744                         vindex ? vindex : ctx->i32_0,
1745                         voffset ? voffset : ctx->i32_0,
1746                         soffset ? soffset : ctx->i32_0,
1747                         immoffset,
1748                         LLVMConstInt(ctx->i32, dfmt, false),
1749                         LLVMConstInt(ctx->i32, nfmt, false),
1750                         LLVMConstInt(ctx->i1, glc, false),
1751                         LLVMConstInt(ctx->i1, slc, false),
1752                 };
1753                 unsigned func = CLAMP(num_channels, 1, 3) - 1;
1754                 const char *type_names[] = {"i32", "v2i32", "v4i32"};
1755                 char name[256];
1756
1757                 snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
1758                          type_names[func]);
1759
1760                 ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
1761                                    ac_get_store_intr_attribs(writeonly_memory));
1762         }
1763 }
1764
1765 void
1766 ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
1767                               LLVMValueRef rsrc,
1768                               LLVMValueRef vdata,
1769                               LLVMValueRef vindex,
1770                               LLVMValueRef voffset,
1771                               LLVMValueRef soffset,
1772                               LLVMValueRef immoffset,
1773                               unsigned num_channels,
1774                               unsigned dfmt,
1775                               unsigned nfmt,
1776                               bool glc,
1777                               bool slc,
1778                               bool writeonly_memory)
1779 {
1780         ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
1781                                immoffset, num_channels, dfmt, nfmt, glc, slc,
1782                                writeonly_memory, true);
1783 }
1784
1785 void
1786 ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
1787                            LLVMValueRef rsrc,
1788                            LLVMValueRef vdata,
1789                            LLVMValueRef voffset,
1790                            LLVMValueRef soffset,
1791                            LLVMValueRef immoffset,
1792                            unsigned num_channels,
1793                            unsigned dfmt,
1794                            unsigned nfmt,
1795                            bool glc,
1796                            bool slc,
1797                            bool writeonly_memory)
1798 {
1799         ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
1800                                immoffset, num_channels, dfmt, nfmt, glc, slc,
1801                                writeonly_memory, false);
1802 }
1803
1804 void
1805 ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
1806                              LLVMValueRef rsrc,
1807                              LLVMValueRef vdata,
1808                              LLVMValueRef voffset,
1809                              LLVMValueRef soffset,
1810                              bool glc,
1811                              bool writeonly_memory)
1812 {
1813         vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
1814
1815         if (HAVE_LLVM >= 0x900) {
1816                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1817                 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
1818                                                    voffset, soffset, 1,
1819                                                    ctx->i16, glc, false,
1820                                                    writeonly_memory, false,
1821                                                    false);
1822         } else {
1823                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1824                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1825
1826                 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
1827
1828                 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1829                                            ctx->i32_0, 1, dfmt, nfmt, glc, false,
1830                                            writeonly_memory);
1831         }
1832 }
1833
1834 void
1835 ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
1836                             LLVMValueRef rsrc,
1837                             LLVMValueRef vdata,
1838                             LLVMValueRef voffset,
1839                             LLVMValueRef soffset,
1840                             bool glc,
1841                             bool writeonly_memory)
1842 {
1843         vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
1844
1845         if (HAVE_LLVM >= 0x900) {
1846                 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1847                 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
1848                                                    voffset, soffset, 1,
1849                                                    ctx->i8, glc, false,
1850                                                    writeonly_memory, false,
1851                                                    false);
1852         } else {
1853                 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1854                 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1855
1856                 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
1857
1858                 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1859                                            ctx->i32_0, 1, dfmt, nfmt, glc, false,
1860                                            writeonly_memory);
1861         }
1862 }
1863 /**
1864  * Set range metadata on an instruction.  This can only be used on load and
1865  * call instructions.  If you know an instruction can only produce the values
1866  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1867  * \p lo is the minimum value inclusive.
1868  * \p hi is the maximum value exclusive.
1869  */
1870 static void set_range_metadata(struct ac_llvm_context *ctx,
1871                                LLVMValueRef value, unsigned lo, unsigned hi)
1872 {
1873         LLVMValueRef range_md, md_args[2];
1874         LLVMTypeRef type = LLVMTypeOf(value);
1875         LLVMContextRef context = LLVMGetTypeContext(type);
1876
1877         md_args[0] = LLVMConstInt(type, lo, false);
1878         md_args[1] = LLVMConstInt(type, hi, false);
1879         range_md = LLVMMDNodeInContext(context, md_args, 2);
1880         LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1881 }
1882
1883 LLVMValueRef
1884 ac_get_thread_id(struct ac_llvm_context *ctx)
1885 {
1886         LLVMValueRef tid;
1887
1888         LLVMValueRef tid_args[2];
1889         tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
1890         tid_args[1] = ctx->i32_0;
1891         tid_args[1] = ac_build_intrinsic(ctx,
1892                                          "llvm.amdgcn.mbcnt.lo", ctx->i32,
1893                                          tid_args, 2, AC_FUNC_ATTR_READNONE);
1894
1895         tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
1896                                  ctx->i32, tid_args,
1897                                  2, AC_FUNC_ATTR_READNONE);
1898         set_range_metadata(ctx, tid, 0, 64);
1899         return tid;
1900 }
1901
1902 /*
1903  * SI implements derivatives using the local data store (LDS)
1904  * All writes to the LDS happen in all executing threads at
1905  * the same time. TID is the Thread ID for the current
1906  * thread and is a value between 0 and 63, representing
1907  * the thread's position in the wavefront.
1908  *
1909  * For the pixel shader threads are grouped into quads of four pixels.
1910  * The TIDs of the pixels of a quad are:
1911  *
1912  *  +------+------+
1913  *  |4n + 0|4n + 1|
1914  *  +------+------+
1915  *  |4n + 2|4n + 3|
1916  *  +------+------+
1917  *
1918  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1919  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1920  * the current pixel's column, and masking with 0xfffffffe yields the TID
1921  * of the left pixel of the current pixel's row.
1922  *
1923  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1924  * adding 2 yields the TID of the pixel below the top pixel.
1925  */
1926 LLVMValueRef
1927 ac_build_ddxy(struct ac_llvm_context *ctx,
1928               uint32_t mask,
1929               int idx,
1930               LLVMValueRef val)
1931 {
1932         unsigned tl_lanes[4], trbl_lanes[4];
1933         char name[32], type[8];
1934         LLVMValueRef tl, trbl;
1935         LLVMTypeRef result_type;
1936         LLVMValueRef result;
1937
1938         result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
1939
1940         if (result_type == ctx->f16)
1941                 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
1942
1943         for (unsigned i = 0; i < 4; ++i) {
1944                 tl_lanes[i] = i & mask;
1945                 trbl_lanes[i] = (i & mask) + idx;
1946         }
1947
1948         tl = ac_build_quad_swizzle(ctx, val,
1949                                    tl_lanes[0], tl_lanes[1],
1950                                    tl_lanes[2], tl_lanes[3]);
1951         trbl = ac_build_quad_swizzle(ctx, val,
1952                                      trbl_lanes[0], trbl_lanes[1],
1953                                      trbl_lanes[2], trbl_lanes[3]);
1954
1955         if (result_type == ctx->f16) {
1956                 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
1957                 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
1958         }
1959
1960         tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
1961         trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
1962         result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
1963
1964         ac_build_type_name_for_intr(result_type, type, sizeof(type));
1965         snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
1966
1967         return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
1968 }
1969
1970 void
1971 ac_build_sendmsg(struct ac_llvm_context *ctx,
1972                  uint32_t msg,
1973                  LLVMValueRef wave_id)
1974 {
1975         LLVMValueRef args[2];
1976         args[0] = LLVMConstInt(ctx->i32, msg, false);
1977         args[1] = wave_id;
1978         ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
1979 }
1980
1981 LLVMValueRef
1982 ac_build_imsb(struct ac_llvm_context *ctx,
1983               LLVMValueRef arg,
1984               LLVMTypeRef dst_type)
1985 {
1986         LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
1987                                               dst_type, &arg, 1,
1988                                               AC_FUNC_ATTR_READNONE);
1989
1990         /* The HW returns the last bit index from MSB, but NIR/TGSI wants
1991          * the index from LSB. Invert it by doing "31 - msb". */
1992         msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
1993                            msb, "");
1994
1995         LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
1996         LLVMValueRef cond = LLVMBuildOr(ctx->builder,
1997                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
1998                                                       arg, ctx->i32_0, ""),
1999                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2000                                                       arg, all_ones, ""), "");
2001
2002         return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
2003 }
2004
2005 LLVMValueRef
2006 ac_build_umsb(struct ac_llvm_context *ctx,
2007               LLVMValueRef arg,
2008               LLVMTypeRef dst_type)
2009 {
2010         const char *intrin_name;
2011         LLVMTypeRef type;
2012         LLVMValueRef highest_bit;
2013         LLVMValueRef zero;
2014         unsigned bitsize;
2015
2016         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
2017         switch (bitsize) {
2018         case 64:
2019                 intrin_name = "llvm.ctlz.i64";
2020                 type = ctx->i64;
2021                 highest_bit = LLVMConstInt(ctx->i64, 63, false);
2022                 zero = ctx->i64_0;
2023                 break;
2024         case 32:
2025                 intrin_name = "llvm.ctlz.i32";
2026                 type = ctx->i32;
2027                 highest_bit = LLVMConstInt(ctx->i32, 31, false);
2028                 zero = ctx->i32_0;
2029                 break;
2030         case 16:
2031                 intrin_name = "llvm.ctlz.i16";
2032                 type = ctx->i16;
2033                 highest_bit = LLVMConstInt(ctx->i16, 15, false);
2034                 zero = ctx->i16_0;
2035                 break;
2036         case 8:
2037                 intrin_name = "llvm.ctlz.i8";
2038                 type = ctx->i8;
2039                 highest_bit = LLVMConstInt(ctx->i8, 7, false);
2040                 zero = ctx->i8_0;
2041                 break;
2042         default:
2043                 unreachable(!"invalid bitsize");
2044                 break;
2045         }
2046
2047         LLVMValueRef params[2] = {
2048                 arg,
2049                 ctx->i1true,
2050         };
2051
2052         LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
2053                                               params, 2,
2054                                               AC_FUNC_ATTR_READNONE);
2055
2056         /* The HW returns the last bit index from MSB, but TGSI/NIR wants
2057          * the index from LSB. Invert it by doing "31 - msb". */
2058         msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
2059
2060         if (bitsize == 64) {
2061                 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
2062         } else if (bitsize < 32) {
2063                 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
2064         }
2065
2066         /* check for zero */
2067         return LLVMBuildSelect(ctx->builder,
2068                                LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
2069                                LLVMConstInt(ctx->i32, -1, true), msb, "");
2070 }
2071
2072 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
2073                            LLVMValueRef b)
2074 {
2075         char name[64];
2076         snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2077         LLVMValueRef args[2] = {a, b};
2078         return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2079                                   AC_FUNC_ATTR_READNONE);
2080 }
2081
2082 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
2083                            LLVMValueRef b)
2084 {
2085         char name[64];
2086         snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2087         LLVMValueRef args[2] = {a, b};
2088         return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2089                                   AC_FUNC_ATTR_READNONE);
2090 }
2091
2092 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a,
2093                            LLVMValueRef b)
2094 {
2095         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
2096         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2097 }
2098
2099 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a,
2100                            LLVMValueRef b)
2101 {
2102         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
2103         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2104 }
2105
2106 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
2107                            LLVMValueRef b)
2108 {
2109         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
2110         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2111 }
2112
2113 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a,
2114                            LLVMValueRef b)
2115 {
2116         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
2117         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2118 }
2119
2120 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
2121 {
2122         LLVMTypeRef t = LLVMTypeOf(value);
2123         return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
2124                              LLVMConstReal(t, 1.0));
2125 }
2126
2127 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
2128 {
2129         LLVMValueRef args[9];
2130
2131         args[0] = LLVMConstInt(ctx->i32, a->target, 0);
2132         args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
2133
2134         if (a->compr) {
2135                 LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
2136                 LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
2137
2138                 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
2139                                 v2i16, "");
2140                 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
2141                                 v2i16, "");
2142                 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
2143                 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2144
2145                 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
2146                                    ctx->voidt, args, 6, 0);
2147         } else {
2148                 args[2] = a->out[0];
2149                 args[3] = a->out[1];
2150                 args[4] = a->out[2];
2151                 args[5] = a->out[3];
2152                 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
2153                 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2154
2155                 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
2156                                    ctx->voidt, args, 8, 0);
2157         }
2158 }
2159
2160 void ac_build_export_null(struct ac_llvm_context *ctx)
2161 {
2162         struct ac_export_args args;
2163
2164         args.enabled_channels = 0x0; /* enabled channels */
2165         args.valid_mask = 1; /* whether the EXEC mask is valid */
2166         args.done = 1; /* DONE bit */
2167         args.target = V_008DFC_SQ_EXP_NULL;
2168         args.compr = 0; /* COMPR flag (0 = 32-bit export) */
2169         args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2170         args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2171         args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2172         args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2173
2174         ac_build_export(ctx, &args);
2175 }
2176
2177 static unsigned ac_num_coords(enum ac_image_dim dim)
2178 {
2179         switch (dim) {
2180         case ac_image_1d:
2181                 return 1;
2182         case ac_image_2d:
2183         case ac_image_1darray:
2184                  return 2;
2185         case ac_image_3d:
2186         case ac_image_cube:
2187         case ac_image_2darray:
2188         case ac_image_2dmsaa:
2189                 return 3;
2190         case ac_image_2darraymsaa:
2191                 return 4;
2192         default:
2193                 unreachable("ac_num_coords: bad dim");
2194         }
2195 }
2196
2197 static unsigned ac_num_derivs(enum ac_image_dim dim)
2198 {
2199         switch (dim) {
2200         case ac_image_1d:
2201         case ac_image_1darray:
2202                 return 2;
2203         case ac_image_2d:
2204         case ac_image_2darray:
2205         case ac_image_cube:
2206                 return 4;
2207         case ac_image_3d:
2208                 return 6;
2209         case ac_image_2dmsaa:
2210         case ac_image_2darraymsaa:
2211         default:
2212                 unreachable("derivatives not supported");
2213         }
2214 }
2215
2216 static const char *get_atomic_name(enum ac_atomic_op op)
2217 {
2218         switch (op) {
2219         case ac_atomic_swap: return "swap";
2220         case ac_atomic_add: return "add";
2221         case ac_atomic_sub: return "sub";
2222         case ac_atomic_smin: return "smin";
2223         case ac_atomic_umin: return "umin";
2224         case ac_atomic_smax: return "smax";
2225         case ac_atomic_umax: return "umax";
2226         case ac_atomic_and: return "and";
2227         case ac_atomic_or: return "or";
2228         case ac_atomic_xor: return "xor";
2229         }
2230         unreachable("bad atomic op");
2231 }
2232
2233 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
2234                                    struct ac_image_args *a)
2235 {
2236         const char *overload[3] = { "", "", "" };
2237         unsigned num_overloads = 0;
2238         LLVMValueRef args[18];
2239         unsigned num_args = 0;
2240         enum ac_image_dim dim = a->dim;
2241
2242         assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
2243                !a->level_zero);
2244         assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
2245                 a->opcode != ac_image_store_mip) ||
2246                a->lod);
2247         assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2248                (!a->compare && !a->offset));
2249         assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2250                 a->opcode == ac_image_get_lod) ||
2251                !a->bias);
2252         assert((a->bias ? 1 : 0) +
2253                (a->lod ? 1 : 0) +
2254                (a->level_zero ? 1 : 0) +
2255                (a->derivs[0] ? 1 : 0) <= 1);
2256
2257         if (a->opcode == ac_image_get_lod) {
2258                 switch (dim) {
2259                 case ac_image_1darray:
2260                         dim = ac_image_1d;
2261                         break;
2262                 case ac_image_2darray:
2263                 case ac_image_cube:
2264                         dim = ac_image_2d;
2265                         break;
2266                 default:
2267                         break;
2268                 }
2269         }
2270
2271         bool sample = a->opcode == ac_image_sample ||
2272                       a->opcode == ac_image_gather4 ||
2273                       a->opcode == ac_image_get_lod;
2274         bool atomic = a->opcode == ac_image_atomic ||
2275                       a->opcode == ac_image_atomic_cmpswap;
2276         LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
2277
2278         if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2279                 args[num_args++] = a->data[0];
2280                 if (a->opcode == ac_image_atomic_cmpswap)
2281                         args[num_args++] = a->data[1];
2282         }
2283
2284         if (!atomic)
2285                 args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
2286
2287         if (a->offset)
2288                 args[num_args++] = ac_to_integer(ctx, a->offset);
2289         if (a->bias) {
2290                 args[num_args++] = ac_to_float(ctx, a->bias);
2291                 overload[num_overloads++] = ".f32";
2292         }
2293         if (a->compare)
2294                 args[num_args++] = ac_to_float(ctx, a->compare);
2295         if (a->derivs[0]) {
2296                 unsigned count = ac_num_derivs(dim);
2297                 for (unsigned i = 0; i < count; ++i)
2298                         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
2299                 overload[num_overloads++] = ".f32";
2300         }
2301         unsigned num_coords =
2302                 a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
2303         for (unsigned i = 0; i < num_coords; ++i)
2304                 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
2305         if (a->lod)
2306                 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
2307         overload[num_overloads++] = sample ? ".f32" : ".i32";
2308
2309         args[num_args++] = a->resource;
2310         if (sample) {
2311                 args[num_args++] = a->sampler;
2312                 args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
2313         }
2314
2315         args[num_args++] = ctx->i32_0; /* texfailctrl */
2316         args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);
2317
2318         const char *name;
2319         const char *atomic_subop = "";
2320         switch (a->opcode) {
2321         case ac_image_sample: name = "sample"; break;
2322         case ac_image_gather4: name = "gather4"; break;
2323         case ac_image_load: name = "load"; break;
2324         case ac_image_load_mip: name = "load.mip"; break;
2325         case ac_image_store: name = "store"; break;
2326         case ac_image_store_mip: name = "store.mip"; break;
2327         case ac_image_atomic:
2328                 name = "atomic.";
2329                 atomic_subop = get_atomic_name(a->atomic);
2330                 break;
2331         case ac_image_atomic_cmpswap:
2332                 name = "atomic.";
2333                 atomic_subop = "cmpswap";
2334                 break;
2335         case ac_image_get_lod: name = "getlod"; break;
2336         case ac_image_get_resinfo: name = "getresinfo"; break;
2337         default: unreachable("invalid image opcode");
2338         }
2339
2340         const char *dimname;
2341         switch (dim) {
2342         case ac_image_1d: dimname = "1d"; break;
2343         case ac_image_2d: dimname = "2d"; break;
2344         case ac_image_3d: dimname = "3d"; break;
2345         case ac_image_cube: dimname = "cube"; break;
2346         case ac_image_1darray: dimname = "1darray"; break;
2347         case ac_image_2darray: dimname = "2darray"; break;
2348         case ac_image_2dmsaa: dimname = "2dmsaa"; break;
2349         case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
2350         default: unreachable("invalid dim");
2351         }
2352
2353         bool lod_suffix =
2354                 a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
2355         char intr_name[96];
2356         snprintf(intr_name, sizeof(intr_name),
2357                  "llvm.amdgcn.image.%s%s" /* base name */
2358                  "%s%s%s" /* sample/gather modifiers */
2359                  ".%s.%s%s%s%s", /* dimension and type overloads */
2360                  name, atomic_subop,
2361                  a->compare ? ".c" : "",
2362                  a->bias ? ".b" :
2363                  lod_suffix ? ".l" :
2364                  a->derivs[0] ? ".d" :
2365                  a->level_zero ? ".lz" : "",
2366                  a->offset ? ".o" : "",
2367                  dimname,
2368                  atomic ? "i32" : "v4f32",
2369                  overload[0], overload[1], overload[2]);
2370
2371         LLVMTypeRef retty;
2372         if (atomic)
2373                 retty = ctx->i32;
2374         else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
2375                 retty = ctx->voidt;
2376         else
2377                 retty = ctx->v4f32;
2378
2379         LLVMValueRef result =
2380                 ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
2381                                    a->attributes);
2382         if (!sample && retty == ctx->v4f32) {
2383                 result = LLVMBuildBitCast(ctx->builder, result,
2384                                           ctx->v4i32, "");
2385         }
2386         return result;
2387 }
2388
2389 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
2390                                     LLVMValueRef args[2])
2391 {
2392         LLVMTypeRef v2f16 =
2393                 LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
2394
2395         return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
2396                                   args, 2, AC_FUNC_ATTR_READNONE);
2397 }
2398
2399 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
2400                                      LLVMValueRef args[2])
2401 {
2402         LLVMValueRef res =
2403                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
2404                                    ctx->v2i16, args, 2,
2405                                    AC_FUNC_ATTR_READNONE);
2406         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2407 }
2408
2409 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
2410                                      LLVMValueRef args[2])
2411 {
2412         LLVMValueRef res =
2413                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
2414                                    ctx->v2i16, args, 2,
2415                                    AC_FUNC_ATTR_READNONE);
2416         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2417 }
2418
2419 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2420 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx,
2421                                  LLVMValueRef args[2], unsigned bits, bool hi)
2422 {
2423         assert(bits == 8 || bits == 10 || bits == 16);
2424
2425         LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2426                 bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2427         LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2428                 bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2429         LLVMValueRef max_alpha =
2430                 bits != 10 ? max_rgb : ctx->i32_1;
2431         LLVMValueRef min_alpha =
2432                 bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2433
2434         /* Clamp. */
2435         if (bits != 16) {
2436                 for (int i = 0; i < 2; i++) {
2437                         bool alpha = hi && i == 1;
2438                         args[i] = ac_build_imin(ctx, args[i],
2439                                                 alpha ? max_alpha : max_rgb);
2440                         args[i] = ac_build_imax(ctx, args[i],
2441                                                 alpha ? min_alpha : min_rgb);
2442                 }
2443         }
2444
2445         LLVMValueRef res =
2446                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
2447                                    ctx->v2i16, args, 2,
2448                                    AC_FUNC_ATTR_READNONE);
2449         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2450 }
2451
2452 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2453 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx,
2454                                  LLVMValueRef args[2], unsigned bits, bool hi)
2455 {
2456         assert(bits == 8 || bits == 10 || bits == 16);
2457
2458         LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2459                 bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2460         LLVMValueRef max_alpha =
2461                 bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2462
2463         /* Clamp. */
2464         if (bits != 16) {
2465                 for (int i = 0; i < 2; i++) {
2466                         bool alpha = hi && i == 1;
2467                         args[i] = ac_build_umin(ctx, args[i],
2468                                                 alpha ? max_alpha : max_rgb);
2469                 }
2470         }
2471
2472         LLVMValueRef res =
2473                 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
2474                                    ctx->v2i16, args, 2,
2475                                    AC_FUNC_ATTR_READNONE);
2476         return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2477 }
2478
2479 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2480 {
2481         return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
2482                                   &i1, 1, AC_FUNC_ATTR_READNONE);
2483 }
2484
2485 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2486 {
2487         ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
2488                            &i1, 1, 0);
2489 }
2490
2491 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
2492                           LLVMValueRef offset, LLVMValueRef width,
2493                           bool is_signed)
2494 {
2495         LLVMValueRef args[] = {
2496                 input,
2497                 offset,
2498                 width,
2499         };
2500
2501         return ac_build_intrinsic(ctx,
2502                                   is_signed ? "llvm.amdgcn.sbfe.i32" :
2503                                               "llvm.amdgcn.ubfe.i32",
2504                                   ctx->i32, args, 3,
2505                                   AC_FUNC_ATTR_READNONE);
2506 }
2507
2508 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2509                            LLVMValueRef s1, LLVMValueRef s2)
2510 {
2511         return LLVMBuildAdd(ctx->builder,
2512                             LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2513 }
2514
2515 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2516                            LLVMValueRef s1, LLVMValueRef s2)
2517 {
2518         return LLVMBuildFAdd(ctx->builder,
2519                              LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2520 }
2521
2522 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
2523 {
2524         LLVMValueRef args[1] = {
2525                 LLVMConstInt(ctx->i32, simm16, false),
2526         };
2527         ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
2528                            ctx->voidt, args, 1, 0);
2529 }
2530
2531 LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0,
2532                             LLVMValueRef src1, LLVMValueRef src2,
2533                             unsigned bitsize)
2534 {
2535         LLVMTypeRef type;
2536         char *intr;
2537
2538         if (bitsize == 16) {
2539                 intr = "llvm.amdgcn.fmed3.f16";
2540                 type = ctx->f16;
2541         } else if (bitsize == 32) {
2542                 intr = "llvm.amdgcn.fmed3.f32";
2543                 type = ctx->f32;
2544         } else {
2545                 intr = "llvm.amdgcn.fmed3.f64";
2546                 type = ctx->f64;
2547         }
2548
2549         LLVMValueRef params[] = {
2550                 src0,
2551                 src1,
2552                 src2,
2553         };
2554         return ac_build_intrinsic(ctx, intr, type, params, 3,
2555                                   AC_FUNC_ATTR_READNONE);
2556 }
2557
2558 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
2559                             unsigned bitsize)
2560 {
2561         LLVMTypeRef type;
2562         char *intr;
2563
2564         if (bitsize == 16) {
2565                 intr = "llvm.amdgcn.fract.f16";
2566                 type = ctx->f16;
2567         } else if (bitsize == 32) {
2568                 intr = "llvm.amdgcn.fract.f32";
2569                 type = ctx->f32;
2570         } else {
2571                 intr = "llvm.amdgcn.fract.f64";
2572                 type = ctx->f64;
2573         }
2574
2575         LLVMValueRef params[] = {
2576                 src0,
2577         };
2578         return ac_build_intrinsic(ctx, intr, type, params, 1,
2579                                   AC_FUNC_ATTR_READNONE);
2580 }
2581
2582 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2583                             unsigned bitsize)
2584 {
2585         LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
2586         LLVMValueRef zero = LLVMConstInt(type, 0, false);
2587         LLVMValueRef one = LLVMConstInt(type, 1, false);
2588
2589         LLVMValueRef cmp, val;
2590         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
2591         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2592         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
2593         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
2594         return val;
2595 }
2596
2597 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2598                             unsigned bitsize)
2599 {
2600         LLVMValueRef cmp, val, zero, one;
2601         LLVMTypeRef type;
2602
2603         if (bitsize == 16) {
2604                 type = ctx->f16;
2605                 zero = ctx->f16_0;
2606                 one = ctx->f16_1;
2607         } else if (bitsize == 32) {
2608                 type = ctx->f32;
2609                 zero = ctx->f32_0;
2610                 one = ctx->f32_1;
2611         } else {
2612                 type = ctx->f64;
2613                 zero = ctx->f64_0;
2614                 one = ctx->f64_1;
2615         }
2616
2617         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
2618         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2619         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
2620         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
2621         return val;
2622 }
2623
2624 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
2625 {
2626         LLVMValueRef result;
2627         unsigned bitsize;
2628
2629         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2630
2631         switch (bitsize) {
2632         case 64:
2633                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
2634                                             (LLVMValueRef []) { src0 }, 1,
2635                                             AC_FUNC_ATTR_READNONE);
2636
2637                 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2638                 break;
2639         case 32:
2640                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
2641                                             (LLVMValueRef []) { src0 }, 1,
2642                                             AC_FUNC_ATTR_READNONE);
2643                 break;
2644         case 16:
2645                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
2646                                             (LLVMValueRef []) { src0 }, 1,
2647                                             AC_FUNC_ATTR_READNONE);
2648
2649                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2650                 break;
2651         case 8:
2652                 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8,
2653                                             (LLVMValueRef []) { src0 }, 1,
2654                                             AC_FUNC_ATTR_READNONE);
2655
2656                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2657                 break;
2658         default:
2659                 unreachable(!"invalid bitsize");
2660                 break;
2661         }
2662
2663         return result;
2664 }
2665
2666 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
2667                                        LLVMValueRef src0)
2668 {
2669         LLVMValueRef result;
2670         unsigned bitsize;
2671
2672         bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2673
2674         switch (bitsize) {
2675         case 64:
2676                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64,
2677                                             (LLVMValueRef []) { src0 }, 1,
2678                                             AC_FUNC_ATTR_READNONE);
2679
2680                 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2681                 break;
2682         case 32:
2683                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32,
2684                                             (LLVMValueRef []) { src0 }, 1,
2685                                             AC_FUNC_ATTR_READNONE);
2686                 break;
2687         case 16:
2688                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
2689                                             (LLVMValueRef []) { src0 }, 1,
2690                                             AC_FUNC_ATTR_READNONE);
2691
2692                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2693                 break;
2694         case 8:
2695                 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8,
2696                                             (LLVMValueRef []) { src0 }, 1,
2697                                             AC_FUNC_ATTR_READNONE);
2698
2699                 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2700                 break;
2701         default:
2702                 unreachable(!"invalid bitsize");
2703                 break;
2704         }
2705
2706         return result;
2707 }
2708
/* Operand indices of an export call, as used with LLVMGetOperand() and
 * LLVMSetOperand(): the target, the enabled-channels mask, and the first of
 * the data operands (channel N is at AC_EXP_OUT0 + N).
 */
#define AC_EXP_TARGET           0
#define AC_EXP_ENABLED_CHANNELS 1
#define AC_EXP_OUT0             2
2712
/* Classification of one channel operand of an export call. */
enum ac_ir_type {
        AC_IR_UNDEF = 0, /* the operand is undef */
        AC_IR_CONST = 1, /* the operand is a floating-point constant */
        AC_IR_VALUE = 2, /* any other value */
};
2718
2719 struct ac_vs_exp_chan
2720 {
2721         LLVMValueRef value;
2722         float const_float;
2723         enum ac_ir_type type;
2724 };
2725
2726 struct ac_vs_exp_inst {
2727         unsigned offset;
2728         LLVMValueRef inst;
2729         struct ac_vs_exp_chan chan[4];
2730 };
2731
2732 struct ac_vs_exports {
2733         unsigned num;
2734         struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
2735 };
2736
2737 /* Return true if the PARAM export has been eliminated. */
2738 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
2739                                       uint32_t num_outputs,
2740                                       struct ac_vs_exp_inst *exp)
2741 {
2742         unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
2743         bool is_zero[4] = {}, is_one[4] = {};
2744
2745         for (i = 0; i < 4; i++) {
2746                 /* It's a constant expression. Undef outputs are eliminated too. */
2747                 if (exp->chan[i].type == AC_IR_UNDEF) {
2748                         is_zero[i] = true;
2749                         is_one[i] = true;
2750                 } else if (exp->chan[i].type == AC_IR_CONST) {
2751                         if (exp->chan[i].const_float == 0)
2752                                 is_zero[i] = true;
2753                         else if (exp->chan[i].const_float == 1)
2754                                 is_one[i] = true;
2755                         else
2756                                 return false; /* other constant */
2757                 } else
2758                         return false;
2759         }
2760
2761         /* Only certain combinations of 0 and 1 can be eliminated. */
2762         if (is_zero[0] && is_zero[1] && is_zero[2])
2763                 default_val = is_zero[3] ? 0 : 1;
2764         else if (is_one[0] && is_one[1] && is_one[2])
2765                 default_val = is_zero[3] ? 2 : 3;
2766         else
2767                 return false;
2768
2769         /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
2770         LLVMInstructionEraseFromParent(exp->inst);
2771
2772         /* Change OFFSET to DEFAULT_VAL. */
2773         for (i = 0; i < num_outputs; i++) {
2774                 if (vs_output_param_offset[i] == exp->offset) {
2775                         vs_output_param_offset[i] =
2776                                 AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
2777                         break;
2778                 }
2779         }
2780         return true;
2781 }
2782
2783 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
2784                                            uint8_t *vs_output_param_offset,
2785                                            uint32_t num_outputs,
2786                                            struct ac_vs_exports *processed,
2787                                            struct ac_vs_exp_inst *exp)
2788 {
2789         unsigned p, copy_back_channels = 0;
2790
2791         /* See if the output is already in the list of processed outputs.
2792          * The LLVMValueRef comparison relies on SSA.
2793          */
2794         for (p = 0; p < processed->num; p++) {
2795                 bool different = false;
2796
2797                 for (unsigned j = 0; j < 4; j++) {
2798                         struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
2799                         struct ac_vs_exp_chan *c2 = &exp->chan[j];
2800
2801                         /* Treat undef as a match. */
2802                         if (c2->type == AC_IR_UNDEF)
2803                                 continue;
2804
2805                         /* If c1 is undef but c2 isn't, we can copy c2 to c1
2806                          * and consider the instruction duplicated.
2807                          */
2808                         if (c1->type == AC_IR_UNDEF) {
2809                                 copy_back_channels |= 1 << j;
2810                                 continue;
2811                         }
2812
2813                         /* Test whether the channels are not equal. */
2814                         if (c1->type != c2->type ||
2815                             (c1->type == AC_IR_CONST &&
2816                              c1->const_float != c2->const_float) ||
2817                             (c1->type == AC_IR_VALUE &&
2818                              c1->value != c2->value)) {
2819                                 different = true;
2820                                 break;
2821                         }
2822                 }
2823                 if (!different)
2824                         break;
2825
2826                 copy_back_channels = 0;
2827         }
2828         if (p == processed->num)
2829                 return false;
2830
2831         /* If a match was found, but the matching export has undef where the new
2832          * one has a normal value, copy the normal value to the undef channel.
2833          */
2834         struct ac_vs_exp_inst *match = &processed->exp[p];
2835
2836         /* Get current enabled channels mask. */
2837         LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
2838         unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
2839
2840         while (copy_back_channels) {
2841                 unsigned chan = u_bit_scan(&copy_back_channels);
2842
2843                 assert(match->chan[chan].type == AC_IR_UNDEF);
2844                 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
2845                                exp->chan[chan].value);
2846                 match->chan[chan] = exp->chan[chan];
2847
2848                 /* Update number of enabled channels because the original mask
2849                  * is not always 0xf.
2850                  */
2851                 enabled_channels |= (1 << chan);
2852                 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
2853                                LLVMConstInt(ctx->i32, enabled_channels, 0));
2854         }
2855
2856         /* The PARAM export is duplicated. Kill it. */
2857         LLVMInstructionEraseFromParent(exp->inst);
2858
2859         /* Change OFFSET to the matching export. */
2860         for (unsigned i = 0; i < num_outputs; i++) {
2861                 if (vs_output_param_offset[i] == exp->offset) {
2862                         vs_output_param_offset[i] = match->offset;
2863                         break;
2864                 }
2865         }
2866         return true;
2867 }
2868
2869 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
2870                             LLVMValueRef main_fn,
2871                             uint8_t *vs_output_param_offset,
2872                             uint32_t num_outputs,
2873                             uint8_t *num_param_exports)
2874 {
2875         LLVMBasicBlockRef bb;
2876         bool removed_any = false;
2877         struct ac_vs_exports exports;
2878
2879         exports.num = 0;
2880
2881         /* Process all LLVM instructions. */
2882         bb = LLVMGetFirstBasicBlock(main_fn);
2883         while (bb) {
2884                 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
2885
2886                 while (inst) {
2887                         LLVMValueRef cur = inst;
2888                         inst = LLVMGetNextInstruction(inst);
2889                         struct ac_vs_exp_inst exp;
2890
2891                         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
2892                                 continue;
2893
2894                         LLVMValueRef callee = ac_llvm_get_called_value(cur);
2895
2896                         if (!ac_llvm_is_function(callee))
2897                                 continue;
2898
2899                         const char *name = LLVMGetValueName(callee);
2900                         unsigned num_args = LLVMCountParams(callee);
2901
2902                         /* Check if this is an export instruction. */
2903                         if ((num_args != 9 && num_args != 8) ||
2904                             (strcmp(name, "llvm.SI.export") &&
2905                              strcmp(name, "llvm.amdgcn.exp.f32")))
2906                                 continue;
2907
2908                         LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
2909                         unsigned target = LLVMConstIntGetZExtValue(arg);
2910
2911                         if (target < V_008DFC_SQ_EXP_PARAM)
2912                                 continue;
2913
2914                         target -= V_008DFC_SQ_EXP_PARAM;
2915
2916                         /* Parse the instruction. */
2917                         memset(&exp, 0, sizeof(exp));
2918                         exp.offset = target;
2919                         exp.inst = cur;
2920
2921                         for (unsigned i = 0; i < 4; i++) {
2922                                 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
2923
2924                                 exp.chan[i].value = v;
2925
2926                                 if (LLVMIsUndef(v)) {
2927                                         exp.chan[i].type = AC_IR_UNDEF;
2928                                 } else if (LLVMIsAConstantFP(v)) {
2929                                         LLVMBool loses_info;
2930                                         exp.chan[i].type = AC_IR_CONST;
2931                                         exp.chan[i].const_float =
2932                                                 LLVMConstRealGetDouble(v, &loses_info);
2933                                 } else {
2934                                         exp.chan[i].type = AC_IR_VALUE;
2935                                 }
2936                         }
2937
2938                         /* Eliminate constant and duplicated PARAM exports. */
2939                         if (ac_eliminate_const_output(vs_output_param_offset,
2940                                                       num_outputs, &exp) ||
2941                             ac_eliminate_duplicated_output(ctx,
2942                                                            vs_output_param_offset,
2943                                                            num_outputs, &exports,
2944                                                            &exp)) {
2945                                 removed_any = true;
2946                         } else {
2947                                 exports.exp[exports.num++] = exp;
2948                         }
2949                 }
2950                 bb = LLVMGetNextBasicBlock(bb);
2951         }
2952
2953         /* Remove holes in export memory due to removed PARAM exports.
2954          * This is done by renumbering all PARAM exports.
2955          */
2956         if (removed_any) {
2957                 uint8_t old_offset[VARYING_SLOT_MAX];
2958                 unsigned out, i;
2959
2960                 /* Make a copy of the offsets. We need the old version while
2961                  * we are modifying some of them. */
2962                 memcpy(old_offset, vs_output_param_offset,
2963                        sizeof(old_offset));
2964
2965                 for (i = 0; i < exports.num; i++) {
2966                         unsigned offset = exports.exp[i].offset;
2967
2968                         /* Update vs_output_param_offset. Multiple outputs can
2969                          * have the same offset.
2970                          */
2971                         for (out = 0; out < num_outputs; out++) {
2972                                 if (old_offset[out] == offset)
2973                                         vs_output_param_offset[out] = i;
2974                         }
2975
2976                         /* Change the PARAM offset in the instruction. */
2977                         LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
2978                                        LLVMConstInt(ctx->i32,
2979                                                     V_008DFC_SQ_EXP_PARAM + i, 0));
2980                 }
2981                 *num_param_exports = exports.num;
2982         }
2983 }
2984
2985 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2986 {
2987         LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2988         ac_build_intrinsic(ctx,
2989                            "llvm.amdgcn.init.exec", ctx->voidt,
2990                            &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
2991 }
2992
2993 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2994 {
2995         unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768;
2996         ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
2997                                      LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
2998                                      "lds");
2999 }
3000
3001 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
3002                          LLVMValueRef dw_addr)
3003 {
3004         return ac_build_load(ctx, ctx->lds, dw_addr);
3005 }
3006
3007 void ac_lds_store(struct ac_llvm_context *ctx,
3008                   LLVMValueRef dw_addr,
3009                   LLVMValueRef value)
3010 {
3011         value = ac_to_integer(ctx, value);
3012         ac_build_indexed_store(ctx, ctx->lds,
3013                                dw_addr, value);
3014 }
3015
3016 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
3017                          LLVMTypeRef dst_type,
3018                          LLVMValueRef src0)
3019 {
3020         unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3021         const char *intrin_name;
3022         LLVMTypeRef type;
3023         LLVMValueRef zero;
3024
3025         switch (src0_bitsize) {
3026         case 64:
3027                 intrin_name = "llvm.cttz.i64";
3028                 type = ctx->i64;
3029                 zero = ctx->i64_0;
3030                 break;
3031         case 32:
3032                 intrin_name = "llvm.cttz.i32";
3033                 type = ctx->i32;
3034                 zero = ctx->i32_0;
3035                 break;
3036         case 16:
3037                 intrin_name = "llvm.cttz.i16";
3038                 type = ctx->i16;
3039                 zero = ctx->i16_0;
3040                 break;
3041         case 8:
3042                 intrin_name = "llvm.cttz.i8";
3043                 type = ctx->i8;
3044                 zero = ctx->i8_0;
3045                 break;
3046         default:
3047                 unreachable(!"invalid bitsize");
3048         }
3049
3050         LLVMValueRef params[2] = {
3051                 src0,
3052
3053                 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
3054                  * add special code to check for x=0. The reason is that
3055                  * the LLVM behavior for x=0 is different from what we
3056                  * need here. However, LLVM also assumes that ffs(x) is
3057                  * in [0, 31], but GLSL expects that ffs(0) = -1, so
3058                  * a conditional assignment to handle 0 is still required.
3059                  *
3060                  * The hardware already implements the correct behavior.
3061                  */
3062                 ctx->i1true,
3063         };
3064
3065         LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
3066                                               params, 2,
3067                                               AC_FUNC_ATTR_READNONE);
3068
3069         if (src0_bitsize == 64) {
3070                 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
3071         } else if (src0_bitsize < 32) {
3072                 lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
3073         }
3074
3075         /* TODO: We need an intrinsic to skip this conditional. */
3076         /* Check for zero: */
3077         return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
3078                                                            LLVMIntEQ, src0,
3079                                                            zero, ""),
3080                                LLVMConstInt(ctx->i32, -1, 0), lsb, "");
3081 }
3082
3083 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
3084 {
3085         return LLVMPointerType(LLVMArrayType(elem_type, 0),
3086                                AC_ADDR_SPACE_CONST);
3087 }
3088
3089 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
3090 {
3091         return LLVMPointerType(LLVMArrayType(elem_type, 0),
3092                                AC_ADDR_SPACE_CONST_32BIT);
3093 }
3094
3095 static struct ac_llvm_flow *
3096 get_current_flow(struct ac_llvm_context *ctx)
3097 {
3098         if (ctx->flow_depth > 0)
3099                 return &ctx->flow[ctx->flow_depth - 1];
3100         return NULL;
3101 }
3102
3103 static struct ac_llvm_flow *
3104 get_innermost_loop(struct ac_llvm_context *ctx)
3105 {
3106         for (unsigned i = ctx->flow_depth; i > 0; --i) {
3107                 if (ctx->flow[i - 1].loop_entry_block)
3108                         return &ctx->flow[i - 1];
3109         }
3110         return NULL;
3111 }
3112
3113 static struct ac_llvm_flow *
3114 push_flow(struct ac_llvm_context *ctx)
3115 {
3116         struct ac_llvm_flow *flow;
3117
3118         if (ctx->flow_depth >= ctx->flow_depth_max) {
3119                 unsigned new_max = MAX2(ctx->flow_depth << 1,
3120                                         AC_LLVM_INITIAL_CF_DEPTH);
3121
3122                 ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow));
3123                 ctx->flow_depth_max = new_max;
3124         }
3125
3126         flow = &ctx->flow[ctx->flow_depth];
3127         ctx->flow_depth++;
3128
3129         flow->next_block = NULL;
3130         flow->loop_entry_block = NULL;
3131         return flow;
3132 }
3133
3134 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
3135                                 int label_id)
3136 {
3137         char buf[32];
3138         snprintf(buf, sizeof(buf), "%s%d", base, label_id);
3139         LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
3140 }
3141
3142 /* Append a basic block at the level of the parent flow.
3143  */
3144 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
3145                                             const char *name)
3146 {
3147         assert(ctx->flow_depth >= 1);
3148
3149         if (ctx->flow_depth >= 2) {
3150                 struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2];
3151
3152                 return LLVMInsertBasicBlockInContext(ctx->context,
3153                                                      flow->next_block, name);
3154         }
3155
3156         LLVMValueRef main_fn =
3157                 LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3158         return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3159 }
3160
3161 /* Emit a branch to the given default target for the current block if
3162  * applicable -- that is, if the current block does not already contain a
3163  * branch from a break or continue.
3164  */
3165 static void emit_default_branch(LLVMBuilderRef builder,
3166                                 LLVMBasicBlockRef target)
3167 {
3168         if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3169                  LLVMBuildBr(builder, target);
3170 }
3171
3172 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3173 {
3174         struct ac_llvm_flow *flow = push_flow(ctx);
3175         flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3176         flow->next_block = append_basic_block(ctx, "ENDLOOP");
3177         set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3178         LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3179         LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3180 }
3181
3182 void ac_build_break(struct ac_llvm_context *ctx)
3183 {
3184         struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3185         LLVMBuildBr(ctx->builder, flow->next_block);
3186 }
3187
3188 void ac_build_continue(struct ac_llvm_context *ctx)
3189 {
3190         struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3191         LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3192 }
3193
3194 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3195 {
3196         struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3197         LLVMBasicBlockRef endif_block;
3198
3199         assert(!current_branch->loop_entry_block);
3200
3201         endif_block = append_basic_block(ctx, "ENDIF");
3202         emit_default_branch(ctx->builder, endif_block);
3203
3204         LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3205         set_basicblock_name(current_branch->next_block, "else", label_id);
3206
3207         current_branch->next_block = endif_block;
3208 }
3209
3210 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3211 {
3212         struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3213
3214         assert(!current_branch->loop_entry_block);
3215
3216         emit_default_branch(ctx->builder, current_branch->next_block);
3217         LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3218         set_basicblock_name(current_branch->next_block, "endif", label_id);
3219
3220         ctx->flow_depth--;
3221 }
3222
3223 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3224 {
3225         struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3226
3227         assert(current_loop->loop_entry_block);
3228
3229         emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3230
3231         LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3232         set_basicblock_name(current_loop->next_block, "endloop", label_id);
3233         ctx->flow_depth--;
3234 }
3235
3236 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3237 {
3238         struct ac_llvm_flow *flow = push_flow(ctx);
3239         LLVMBasicBlockRef if_block;
3240
3241         if_block = append_basic_block(ctx, "IF");
3242         flow->next_block = append_basic_block(ctx, "ELSE");
3243         set_basicblock_name(if_block, "if", label_id);
3244         LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3245         LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3246 }
3247
3248 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
3249                  int label_id)
3250 {
3251         LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
3252                                           value, ctx->f32_0, "");
3253         ac_build_ifcc(ctx, cond, label_id);
3254 }
3255
3256 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
3257                   int label_id)
3258 {
3259         LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3260                                           ac_to_integer(ctx, value),
3261                                           ctx->i32_0, "");
3262         ac_build_ifcc(ctx, cond, label_id);
3263 }
3264
3265 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
3266                              const char *name)
3267 {
3268         LLVMBuilderRef builder = ac->builder;
3269         LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3270         LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3271         LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3272         LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3273         LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3274         LLVMValueRef res;
3275
3276         if (first_instr) {
3277                 LLVMPositionBuilderBefore(first_builder, first_instr);
3278         } else {
3279                 LLVMPositionBuilderAtEnd(first_builder, first_block);
3280         }
3281
3282         res = LLVMBuildAlloca(first_builder, type, name);
3283         LLVMDisposeBuilder(first_builder);
3284         return res;
3285 }
3286
3287 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
3288                                    LLVMTypeRef type, const char *name)
3289 {
3290         LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3291         LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3292         return ptr;
3293 }
3294
3295 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
3296                          LLVMTypeRef type)
3297 {
3298         int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3299         return LLVMBuildBitCast(ctx->builder, ptr,
3300                                 LLVMPointerType(type, addr_space), "");
3301 }
3302
3303 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
3304                             unsigned count)
3305 {
3306         unsigned num_components = ac_get_llvm_num_components(value);
3307         if (count == num_components)
3308                 return value;
3309
3310         LLVMValueRef masks[MAX2(count, 2)];
3311         masks[0] = ctx->i32_0;
3312         masks[1] = ctx->i32_1;
3313         for (unsigned i = 2; i < count; i++)
3314                 masks[i] = LLVMConstInt(ctx->i32, i, false);
3315
3316         if (count == 1)
3317                 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
3318                                                "");
3319
3320         LLVMValueRef swizzle = LLVMConstVector(masks, count);
3321         return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3322 }
3323
3324 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
3325                              unsigned rshift, unsigned bitwidth)
3326 {
3327         LLVMValueRef value = param;
3328         if (rshift)
3329                 value = LLVMBuildLShr(ctx->builder, value,
3330                                       LLVMConstInt(ctx->i32, rshift, false), "");
3331
3332         if (rshift + bitwidth < 32) {
3333                 unsigned mask = (1 << bitwidth) - 1;
3334                 value = LLVMBuildAnd(ctx->builder, value,
3335                                      LLVMConstInt(ctx->i32, mask, false), "");
3336         }
3337         return value;
3338 }
3339
3340 /* Adjust the sample index according to FMASK.
3341  *
3342  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3343  * which is the identity mapping. Each nibble says which physical sample
3344  * should be fetched to get that sample.
3345  *
3346  * For example, 0x11111100 means there are only 2 samples stored and
3347  * the second sample covers 3/4 of the pixel. When reading samples 0
3348  * and 1, return physical sample 0 (determined by the first two 0s
3349  * in FMASK), otherwise return physical sample 1.
3350  *
3351  * The sample index should be adjusted as follows:
3352  *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3353  */
3354 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
3355                               LLVMValueRef *addr, bool is_array_tex)
3356 {
3357         struct ac_image_args fmask_load = {};
3358         fmask_load.opcode = ac_image_load;
3359         fmask_load.resource = fmask;
3360         fmask_load.dmask = 0xf;
3361         fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3362         fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3363
3364         fmask_load.coords[0] = addr[0];
3365         fmask_load.coords[1] = addr[1];
3366         if (is_array_tex)
3367                 fmask_load.coords[2] = addr[2];
3368
3369         LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3370         fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
3371                                               ac->i32_0, "");
3372
3373         /* Apply the formula. */
3374         unsigned sample_chan = is_array_tex ? 3 : 2;
3375         LLVMValueRef final_sample;
3376         final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3377                                     LLVMConstInt(ac->i32, 4, 0), "");
3378         final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3379         /* Mask the sample index by 0x7, because 0x8 means an unknown value
3380          * with EQAA, so those will map to 0. */
3381         final_sample = LLVMBuildAnd(ac->builder, final_sample,
3382                                     LLVMConstInt(ac->i32, 0x7, 0), "");
3383
3384         /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3385          * resource descriptor is 0 (invalid).
3386          */
3387         LLVMValueRef tmp;
3388         tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3389         tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3390         tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3391
3392         /* Replace the MSAA sample index. */
3393         addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
3394                                             addr[sample_chan], "");
3395 }
3396
3397 static LLVMValueRef
3398 _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3399 {
3400         ac_build_optimization_barrier(ctx, &src);
3401         return ac_build_intrinsic(ctx,
3402                         lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3403                         LLVMTypeOf(src), (LLVMValueRef []) {
3404                         src, lane },
3405                         lane == NULL ? 1 : 2,
3406                         AC_FUNC_ATTR_READNONE |
3407                         AC_FUNC_ATTR_CONVERGENT);
3408 }
3409
3410 /**
3411  * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3412  * @param ctx
3413  * @param src
3414  * @param lane - id of the lane or NULL for the first active lane
3415  * @return value of the lane
3416  */
3417 LLVMValueRef
3418 ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3419 {
3420         LLVMTypeRef src_type = LLVMTypeOf(src);
3421         src = ac_to_integer(ctx, src);
3422         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3423         LLVMValueRef ret;
3424
3425         if (bits == 32) {
3426                 ret = _ac_build_readlane(ctx, src, lane);
3427         } else {
3428                 assert(bits % 32 == 0);
3429                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3430                 LLVMValueRef src_vector =
3431                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3432                 ret = LLVMGetUndef(vec_type);
3433                 for (unsigned i = 0; i < bits / 32; i++) {
3434                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3435                                                 LLVMConstInt(ctx->i32, i, 0), "");
3436                         LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
3437                         ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
3438                                                 LLVMConstInt(ctx->i32, i, 0), "");
3439                 }
3440         }
3441         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3442 }
3443
3444 LLVMValueRef
3445 ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
3446 {
3447         /* TODO: Use the actual instruction when LLVM adds an intrinsic for it.
3448          */
3449         LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
3450                                           ac_get_thread_id(ctx), "");
3451         return LLVMBuildSelect(ctx->builder, pred, value, src, "");
3452 }
3453
3454 LLVMValueRef
3455 ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3456 {
3457         LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
3458                                                  LLVMVectorType(ctx->i32, 2),
3459                                                  "");
3460         LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
3461                                                        ctx->i32_0, "");
3462         LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
3463                                                        ctx->i32_1, "");
3464         LLVMValueRef val =
3465                 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3466                                    (LLVMValueRef []) { mask_lo, ctx->i32_0 },
3467                                    2, AC_FUNC_ATTR_READNONE);
3468         val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
3469                                  (LLVMValueRef []) { mask_hi, val },
3470                                  2, AC_FUNC_ATTR_READNONE);
3471         return val;
3472 }
3473
/* Values passed as the dpp_ctrl operand of "llvm.amdgcn.update.dpp.i32".
 *
 * _dpp_quad_perm, _dpp_row_sl and _dpp_row_sr are base encodings that the
 * helpers dpp_quad_perm(), dpp_row_sl() and dpp_row_sr() OR together with
 * their operand. The entries without a leading underscore are complete
 * values.
 */
enum dpp_ctrl {
        /* Base encodings, to be combined with an operand. */
        _dpp_quad_perm      = 0x000,
        _dpp_row_sl         = 0x100,
        _dpp_row_sr         = 0x110,
        _dpp_row_rr         = 0x120,

        /* Complete control values. */
        dpp_wf_sl1          = 0x130,
        dpp_wf_rl1          = 0x134,
        dpp_wf_sr1          = 0x138,
        dpp_wf_rr1          = 0x13C,
        dpp_row_mirror      = 0x140,
        dpp_row_half_mirror = 0x141,
        dpp_row_bcast15     = 0x142,
        dpp_row_bcast31     = 0x143
};
3488
3489 static inline enum dpp_ctrl
3490 dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
3491 {
3492         assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3493         return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3494 }
3495
3496 static inline enum dpp_ctrl
3497 dpp_row_sl(unsigned amount)
3498 {
3499         assert(amount > 0 && amount < 16);
3500         return _dpp_row_sl | amount;
3501 }
3502
3503 static inline enum dpp_ctrl
3504 dpp_row_sr(unsigned amount)
3505 {
3506         assert(amount > 0 && amount < 16);
3507         return _dpp_row_sr | amount;
3508 }
3509
3510 static LLVMValueRef
3511 _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3512               enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3513               bool bound_ctrl)
3514 {
3515         return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
3516                                         LLVMTypeOf(old),
3517                                         (LLVMValueRef[]) {
3518                                                 old, src,
3519                                                 LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3520                                                 LLVMConstInt(ctx->i32, row_mask, 0),
3521                                                 LLVMConstInt(ctx->i32, bank_mask, 0),
3522                                                 LLVMConstInt(ctx->i1, bound_ctrl, 0) },
3523                                         6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3524 }
3525
3526 static LLVMValueRef
3527 ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3528              enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3529              bool bound_ctrl)
3530 {
3531         LLVMTypeRef src_type = LLVMTypeOf(src);
3532         src = ac_to_integer(ctx, src);
3533         old = ac_to_integer(ctx, old);
3534         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3535         LLVMValueRef ret;
3536         if (bits == 32) {
3537                 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
3538                                     bank_mask, bound_ctrl);
3539         } else {
3540                 assert(bits % 32 == 0);
3541                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3542                 LLVMValueRef src_vector =
3543                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3544                 LLVMValueRef old_vector =
3545                         LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3546                 ret = LLVMGetUndef(vec_type);
3547                 for (unsigned i = 0; i < bits / 32; i++) {
3548                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3549                                                       LLVMConstInt(ctx->i32, i,
3550                                                                    0), "");
3551                         old = LLVMBuildExtractElement(ctx->builder, old_vector,
3552                                                       LLVMConstInt(ctx->i32, i,
3553                                                                    0), "");
3554                         LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
3555                                                               dpp_ctrl,
3556                                                               row_mask,
3557                                                               bank_mask,
3558                                                               bound_ctrl);
3559                         ret = LLVMBuildInsertElement(ctx->builder, ret,
3560                                                      ret_comp,
3561                                                      LLVMConstInt(ctx->i32, i,
3562                                                                   0), "");
3563                 }
3564         }
3565         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3566 }
3567
/* Pack three 5-bit masks into one pattern word: and_mask in bits 0-4,
 * or_mask in bits 5-9 and xor_mask in bits 10-14. Each mask must be
 * below 32.
 */
static inline unsigned
ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
        assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

        unsigned pattern = and_mask;

        pattern |= or_mask << 5;
        pattern |= xor_mask << 10;
        return pattern;
}
3574
3575 static LLVMValueRef
3576 _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3577 {
3578         return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
3579                                    LLVMTypeOf(src), (LLVMValueRef []) {
3580                                         src, LLVMConstInt(ctx->i32, mask, 0) },
3581                                    2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3582 }
3583
3584 LLVMValueRef
3585 ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3586 {
3587         LLVMTypeRef src_type = LLVMTypeOf(src);
3588         src = ac_to_integer(ctx, src);
3589         unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3590         LLVMValueRef ret;
3591         if (bits == 32) {
3592                 ret = _ac_build_ds_swizzle(ctx, src, mask);
3593         } else {
3594                 assert(bits % 32 == 0);
3595                 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3596                 LLVMValueRef src_vector =
3597                         LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3598                 ret = LLVMGetUndef(vec_type);
3599                 for (unsigned i = 0; i < bits / 32; i++) {
3600                         src = LLVMBuildExtractElement(ctx->builder, src_vector,
3601                                                       LLVMConstInt(ctx->i32, i,
3602                                                                    0), "");
3603                         LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
3604                                                                      mask);
3605                         ret = LLVMBuildInsertElement(ctx->builder, ret,
3606                                                      ret_comp,
3607                                                      LLVMConstInt(ctx->i32, i,
3608                                                                   0), "");
3609                 }
3610         }
3611         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3612 }
3613
3614 static LLVMValueRef
3615 ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
3616 {
3617         char name[32], type[8];
3618         ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3619         snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
3620         return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
3621                                   (LLVMValueRef []) { src }, 1,
3622                                   AC_FUNC_ATTR_READNONE);
3623 }
3624
3625 static LLVMValueRef
3626 ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
3627                       LLVMValueRef inactive)
3628 {
3629         char name[33], type[8];
3630         LLVMTypeRef src_type = LLVMTypeOf(src);
3631         src = ac_to_integer(ctx, src);
3632         inactive = ac_to_integer(ctx, inactive);
3633         ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3634         snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
3635         LLVMValueRef ret =
3636                 ac_build_intrinsic(ctx, name,
3637                                         LLVMTypeOf(src), (LLVMValueRef []) {
3638                                         src, inactive }, 2,
3639                                         AC_FUNC_ATTR_READNONE |
3640                                         AC_FUNC_ATTR_CONVERGENT);
3641         return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3642 }
3643
3644 static LLVMValueRef
3645 get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
3646 {
3647         if (type_size == 4) {
3648                 switch (op) {
3649                 case nir_op_iadd: return ctx->i32_0;
3650                 case nir_op_fadd: return ctx->f32_0;
3651                 case nir_op_imul: return ctx->i32_1;
3652                 case nir_op_fmul: return ctx->f32_1;
3653                 case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3654                 case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3655                 case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
3656                 case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3657                 case nir_op_umax: return ctx->i32_0;
3658                 case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
3659                 case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
3660                 case nir_op_ior: return ctx->i32_0;
3661                 case nir_op_ixor: return ctx->i32_0;
3662                 default:
3663                         unreachable("bad reduction intrinsic");
3664                 }
3665         } else { /* type_size == 64bit */
3666                 switch (op) {
3667                 case nir_op_iadd: return ctx->i64_0;
3668                 case nir_op_fadd: return ctx->f64_0;
3669                 case nir_op_imul: return ctx->i64_1;
3670                 case nir_op_fmul: return ctx->f64_1;
3671                 case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3672                 case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3673                 case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
3674                 case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3675                 case nir_op_umax: return ctx->i64_0;
3676                 case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
3677                 case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
3678                 case nir_op_ior: return ctx->i64_0;
3679                 case nir_op_ixor: return ctx->i64_0;
3680                 default:
3681                         unreachable("bad reduction intrinsic");
3682                 }
3683         }
3684 }
3685
3686 static LLVMValueRef
3687 ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
3688 {
3689         bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
3690         switch (op) {
3691         case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
3692         case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
3693         case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
3694         case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
3695         case nir_op_imin: return LLVMBuildSelect(ctx->builder,
3696                                         LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
3697                                         lhs, rhs, "");
3698         case nir_op_umin: return LLVMBuildSelect(ctx->builder,
3699                                         LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
3700                                         lhs, rhs, "");
3701         case nir_op_fmin: return ac_build_intrinsic(ctx,
3702                                         _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
3703                                         _64bit ? ctx->f64 : ctx->f32,
3704                                         (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
3705         case nir_op_imax: return LLVMBuildSelect(ctx->builder,
3706                                         LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
3707                                         lhs, rhs, "");
3708         case nir_op_umax: return LLVMBuildSelect(ctx->builder,
3709                                         LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
3710                                         lhs, rhs, "");
3711         case nir_op_fmax: return ac_build_intrinsic(ctx,
3712                                         _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
3713                                         _64bit ? ctx->f64 : ctx->f32,
3714                                         (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
3715         case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
3716         case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
3717         case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
3718         default:
3719                 unreachable("bad reduction intrinsic");
3720         }
3721 }
3722
3723 /**
3724  * \param maxprefix specifies that the result only needs to be correct for a
3725  *     prefix of this many threads
3726  *
3727  * TODO: add inclusive and excluse scan functions for SI chip class.
3728  */
3729 static LLVMValueRef
3730 ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
3731               unsigned maxprefix)
3732 {
3733         LLVMValueRef result, tmp;
3734         result = src;
3735         if (maxprefix <= 1)
3736                 return result;
3737         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3738         result = ac_build_alu_op(ctx, result, tmp, op);
3739         if (maxprefix <= 2)
3740                 return result;
3741         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
3742         result = ac_build_alu_op(ctx, result, tmp, op);
3743         if (maxprefix <= 3)
3744                 return result;
3745         tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
3746         result = ac_build_alu_op(ctx, result, tmp, op);
3747         if (maxprefix <= 4)
3748                 return result;
3749         tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
3750         result = ac_build_alu_op(ctx, result, tmp, op);
3751         if (maxprefix <= 8)
3752                 return result;
3753         tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
3754         result = ac_build_alu_op(ctx, result, tmp, op);
3755         if (maxprefix <= 16)
3756                 return result;
3757         tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3758         result = ac_build_alu_op(ctx, result, tmp, op);
3759         if (maxprefix <= 32)
3760                 return result;
3761         tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3762         result = ac_build_alu_op(ctx, result, tmp, op);
3763         return result;
3764 }
3765
3766 LLVMValueRef
3767 ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3768 {
3769         LLVMValueRef result;
3770
3771         if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3772                 LLVMBuilderRef builder = ctx->builder;
3773                 src = LLVMBuildZExt(builder, src, ctx->i32, "");
3774                 result = ac_build_ballot(ctx, src);
3775                 result = ac_build_mbcnt(ctx, result);
3776                 result = LLVMBuildAdd(builder, result, src, "");
3777                 return result;
3778         }
3779
3780         ac_build_optimization_barrier(ctx, &src);
3781
3782         LLVMValueRef identity =
3783                 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3784         result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3785                                   LLVMTypeOf(identity), "");
3786         result = ac_build_scan(ctx, op, result, identity, 64);
3787
3788         return ac_build_wwm(ctx, result);
3789 }
3790
3791 LLVMValueRef
3792 ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3793 {
3794         LLVMValueRef result;
3795
3796         if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3797                 LLVMBuilderRef builder = ctx->builder;
3798                 src = LLVMBuildZExt(builder, src, ctx->i32, "");
3799                 result = ac_build_ballot(ctx, src);
3800                 result = ac_build_mbcnt(ctx, result);
3801                 return result;
3802         }
3803
3804         ac_build_optimization_barrier(ctx, &src);
3805
3806         LLVMValueRef identity =
3807                 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3808         result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3809                                   LLVMTypeOf(identity), "");
3810         result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
3811         result = ac_build_scan(ctx, op, result, identity, 64);
3812
3813         return ac_build_wwm(ctx, result);
3814 }
3815
3816 LLVMValueRef
3817 ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
3818 {
3819         if (cluster_size == 1) return src;
3820         ac_build_optimization_barrier(ctx, &src);
3821         LLVMValueRef result, swap;
3822         LLVMValueRef identity = get_reduction_identity(ctx, op,
3823                                                                 ac_get_type_size(LLVMTypeOf(src)));
3824         result = LLVMBuildBitCast(ctx->builder,
3825                                                                 ac_build_set_inactive(ctx, src, identity),
3826                                                                 LLVMTypeOf(identity), "");
3827         swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
3828         result = ac_build_alu_op(ctx, result, swap, op);
3829         if (cluster_size == 2) return ac_build_wwm(ctx, result);
3830
3831         swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
3832         result = ac_build_alu_op(ctx, result, swap, op);
3833         if (cluster_size == 4) return ac_build_wwm(ctx, result);
3834
3835         if (ctx->chip_class >= VI)
3836                 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
3837         else
3838                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
3839         result = ac_build_alu_op(ctx, result, swap, op);
3840         if (cluster_size == 8) return ac_build_wwm(ctx, result);
3841
3842         if (ctx->chip_class >= VI)
3843                 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
3844         else
3845                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
3846         result = ac_build_alu_op(ctx, result, swap, op);
3847         if (cluster_size == 16) return ac_build_wwm(ctx, result);
3848
3849         if (ctx->chip_class >= VI && cluster_size != 32)
3850                 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3851         else
3852                 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
3853         result = ac_build_alu_op(ctx, result, swap, op);
3854         if (cluster_size == 32) return ac_build_wwm(ctx, result);
3855
3856         if (ctx->chip_class >= VI) {
3857                 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3858                 result = ac_build_alu_op(ctx, result, swap, op);
3859                 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
3860                 return ac_build_wwm(ctx, result);
3861         } else {
3862                 swap = ac_build_readlane(ctx, result, ctx->i32_0);
3863                 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
3864                 result = ac_build_alu_op(ctx, result, swap, op);
3865                 return ac_build_wwm(ctx, result);
3866         }
3867 }
3868
3869 /**
3870  * "Top half" of a scan that reduces per-wave values across an entire
3871  * workgroup.
3872  *
3873  * The source value must be present in the highest lane of the wave, and the
3874  * highest lane must be live.
3875  */
3876 void
3877 ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3878 {
3879         if (ws->maxwaves <= 1)
3880                 return;
3881
3882         const LLVMValueRef i32_63 = LLVMConstInt(ctx->i32, 63, false);
3883         LLVMBuilderRef builder = ctx->builder;
3884         LLVMValueRef tid = ac_get_thread_id(ctx);
3885         LLVMValueRef tmp;
3886
3887         tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, i32_63, "");
3888         ac_build_ifcc(ctx, tmp, 1000);
3889         LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
3890         ac_build_endif(ctx, 1000);
3891 }
3892
3893 /**
3894  * "Bottom half" of a scan that reduces per-wave values across an entire
3895  * workgroup.
3896  *
3897  * The caller must place a barrier between the top and bottom halves.
3898  */
3899 void
3900 ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3901 {
3902         const LLVMTypeRef type = LLVMTypeOf(ws->src);
3903         const LLVMValueRef identity =
3904                 get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
3905
3906         if (ws->maxwaves <= 1) {
3907                 ws->result_reduce = ws->src;
3908                 ws->result_inclusive = ws->src;
3909                 ws->result_exclusive = identity;
3910                 return;
3911         }
3912         assert(ws->maxwaves <= 32);
3913
3914         LLVMBuilderRef builder = ctx->builder;
3915         LLVMValueRef tid = ac_get_thread_id(ctx);
3916         LLVMBasicBlockRef bbs[2];
3917         LLVMValueRef phivalues_scan[2];
3918         LLVMValueRef tmp, tmp2;
3919
3920         bbs[0] = LLVMGetInsertBlock(builder);
3921         phivalues_scan[0] = LLVMGetUndef(type);
3922
3923         if (ws->enable_reduce)
3924                 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
3925         else if (ws->enable_inclusive)
3926                 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
3927         else
3928                 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
3929         ac_build_ifcc(ctx, tmp, 1001);
3930         {
3931                 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
3932
3933                 ac_build_optimization_barrier(ctx, &tmp);
3934
3935                 bbs[1] = LLVMGetInsertBlock(builder);
3936                 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves);
3937         }
3938         ac_build_endif(ctx, 1001);
3939
3940         const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
3941
3942         if (ws->enable_reduce) {
3943                 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
3944                 ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
3945         }
3946         if (ws->enable_inclusive)
3947                 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
3948         if (ws->enable_exclusive) {
3949                 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
3950                 tmp = ac_build_readlane(ctx, scan, tmp);
3951                 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
3952                 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
3953         }
3954 }
3955
/**
 * Inclusive scan of a per-wave value across an entire workgroup.
 *
 * Runs the top half, an s_barrier (via ac_build_s_barrier), and then the
 * bottom half.
 *
 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
 * of the workgroup are live. (This requirement cannot easily be relaxed in a
 * useful manner because of the barrier in the algorithm.)
 */
void
ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
        /* Publish the per-wave values. */
        ac_build_wg_wavescan_top(ctx, ws);

        ac_build_s_barrier(ctx);

        /* Read them back and combine them. */
        ac_build_wg_wavescan_bottom(ctx, ws);
}
3972
3973 /**
3974  * "Top half" of a scan that reduces per-thread values across an entire
3975  * workgroup.
3976  *
3977  * All lanes must be active when this code runs.
3978  */
3979 void
3980 ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3981 {
3982         if (ws->enable_exclusive) {
3983                 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
3984                 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
3985                         ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
3986                 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
3987         } else {
3988                 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
3989         }
3990
3991         bool enable_inclusive = ws->enable_inclusive;
3992         bool enable_exclusive = ws->enable_exclusive;
3993         ws->enable_inclusive = false;
3994         ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
3995         ac_build_wg_wavescan_top(ctx, ws);
3996         ws->enable_inclusive = enable_inclusive;
3997         ws->enable_exclusive = enable_exclusive;
3998 }
3999
4000 /**
4001  * "Bottom half" of a scan that reduces per-thread values across an entire
4002  * workgroup.
4003  *
4004  * The caller must place a barrier between the top and bottom halves.
4005  */
4006 void
4007 ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4008 {
4009         bool enable_inclusive = ws->enable_inclusive;
4010         bool enable_exclusive = ws->enable_exclusive;
4011         ws->enable_inclusive = false;
4012         ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4013         ac_build_wg_wavescan_bottom(ctx, ws);
4014         ws->enable_inclusive = enable_inclusive;
4015         ws->enable_exclusive = enable_exclusive;
4016
4017         /* ws->result_reduce is already the correct value */
4018         if (ws->enable_inclusive)
4019                 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
4020         if (ws->enable_exclusive)
4021                 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4022 }
4023
/**
 * A scan that reduces per-thread values across an entire workgroup.
 *
 * Runs the top half, an s_barrier (via ac_build_s_barrier), and then the
 * bottom half.
 *
 * The caller must ensure that all lanes are active when this code runs
 * (WWM is insufficient!), because there is an implied barrier.
 */
void
ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
        /* Per-thread scan plus publishing of the per-wave values. */
        ac_build_wg_scan_top(ctx, ws);

        ac_build_s_barrier(ctx);

        /* Combine the per-wave values with the per-thread results. */
        ac_build_wg_scan_bottom(ctx, ws);
}
4037
4038 LLVMValueRef
4039 ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
4040                 unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
4041 {
4042         unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4043         if (ctx->chip_class >= VI) {
4044                 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4045         } else {
4046                 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4047         }
4048 }
4049
4050 LLVMValueRef
4051 ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4052 {
4053         index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4054         return ac_build_intrinsic(ctx,
4055                   "llvm.amdgcn.ds.bpermute", ctx->i32,
4056                   (LLVMValueRef []) {index, src}, 2,
4057                   AC_FUNC_ATTR_READNONE |
4058                   AC_FUNC_ATTR_CONVERGENT);
4059 }
4060
4061 LLVMValueRef
4062 ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0,
4063                    unsigned bitsize)
4064 {
4065         LLVMTypeRef type;
4066         char *intr;
4067
4068         if (bitsize == 16) {
4069                 intr = "llvm.amdgcn.frexp.exp.i16.f16";
4070                 type = ctx->i16;
4071         } else if (bitsize == 32) {
4072                 intr = "llvm.amdgcn.frexp.exp.i32.f32";
4073                 type = ctx->i32;
4074         } else {
4075                 intr = "llvm.amdgcn.frexp.exp.i32.f64";
4076                 type = ctx->i32;
4077         }
4078
4079         LLVMValueRef params[] = {
4080                 src0,
4081         };
4082         return ac_build_intrinsic(ctx, intr, type, params, 1,
4083                                   AC_FUNC_ATTR_READNONE);
4084 }
4085 LLVMValueRef
4086 ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0,
4087                     unsigned bitsize)
4088 {
4089         LLVMTypeRef type;
4090         char *intr;
4091
4092         if (bitsize == 16) {
4093                 intr = "llvm.amdgcn.frexp.mant.f16";
4094                 type = ctx->f16;
4095         } else if (bitsize == 32) {
4096                 intr = "llvm.amdgcn.frexp.mant.f32";
4097                 type = ctx->f32;
4098         } else {
4099                 intr = "llvm.amdgcn.frexp.mant.f64";
4100                 type = ctx->f64;
4101         }
4102
4103         LLVMValueRef params[] = {
4104                 src0,
4105         };
4106         return ac_build_intrinsic(ctx, intr, type, params, 1,
4107                                   AC_FUNC_ATTR_READNONE);
4108 }
4109
4110 /*
4111  * this takes an I,J coordinate pair,
4112  * and works out the X and Y derivatives.
4113  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4114  */
4115 LLVMValueRef
4116 ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4117 {
4118         LLVMValueRef result[4], a;
4119         unsigned i;
4120
4121         for (i = 0; i < 2; i++) {
4122                 a = LLVMBuildExtractElement(ctx->builder, interp_ij,
4123                                             LLVMConstInt(ctx->i32, i, false), "");
4124                 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4125                 result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4126         }
4127         return ac_build_gather_values(ctx, result, 4);
4128 }
4129
4130 LLVMValueRef
4131 ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4132 {
4133         LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live",
4134                                                  ctx->i1, NULL, 0,
4135                                                  AC_FUNC_ATTR_READNONE);
4136         result = LLVMBuildNot(ctx->builder, result, "");
4137         return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
4138 }