src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c

   1 /*
   2  * Copyright 2017 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include "si_shader_internal.h"
  26 #include "si_pipe.h"
  27 #include "sid.h"
  28 #include "tgsi/tgsi_build.h"
  29 #include "tgsi/tgsi_util.h"
  30 #include "ac_llvm_util.h"
  31
  32 static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
  33                            struct lp_build_emit_data *emit_data,
  34                            LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
  35                            LLVMValueRef *fmask_ptr);
  36
  37 /**
  38  * Given a v8i32 resource descriptor for a buffer, extract the size of the
  39  * buffer in number of elements and return it as an i32.
  40  */
  41 static LLVMValueRef get_buffer_size(
  42         struct lp_build_tgsi_context *bld_base,
  43         LLVMValueRef descriptor)
  44 {
  45         struct si_shader_context *ctx = si_shader_context(bld_base);
  46         LLVMBuilderRef builder = ctx->ac.builder;
  47         LLVMValueRef size =
  48                 LLVMBuildExtractElement(builder, descriptor,
  49                                         LLVMConstInt(ctx->i32, 2, 0), "");
  50
  51         if (ctx->screen->info.chip_class == GFX8) {
  52                 /* On GFX8, the descriptor contains the size in bytes,
  53                  * but TXQ must return the size in elements.
  54                  * The stride is always non-zero for resources using TXQ.
  55                  */
  56                 LLVMValueRef stride =
  57                         LLVMBuildExtractElement(builder, descriptor,
  58                                                 ctx->i32_1, "");
  59                 stride = LLVMBuildLShr(builder, stride,
  60                                        LLVMConstInt(ctx->i32, 16, 0), "");
  61                 stride = LLVMBuildAnd(builder, stride,
  62                                       LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
  63
  64                 size = LLVMBuildUDiv(builder, size, stride, "");
  65         }
  66
  67         return size;
  68 }
  69
  70 static LLVMValueRef
  71 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
  72                          const struct tgsi_full_src_register *reg,
  73                          bool ubo)
  74 {
  75         LLVMValueRef index;
  76
  77         if (!reg->Register.Indirect) {
  78                 index = LLVMConstInt(ctx->i32, reg->Register.Index, false);
  79         } else {
  80                 index = si_get_indirect_index(ctx, &reg->Indirect,
  81                                               1, reg->Register.Index);
  82         }
  83
  84         if (ubo)
  85                 return ctx->abi.load_ubo(&ctx->abi, index);
  86         else
  87                 return ctx->abi.load_ssbo(&ctx->abi, index, false);
  88 }
  89
  90 static enum ac_image_dim
  91 ac_texture_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target)
  92 {
  93         switch (target) {
  94         case TGSI_TEXTURE_1D:
  95         case TGSI_TEXTURE_SHADOW1D:
  96                 if (screen->info.chip_class == GFX9)
  97                         return ac_image_2d;
  98                 return ac_image_1d;
  99         case TGSI_TEXTURE_2D:
 100         case TGSI_TEXTURE_SHADOW2D:
 101         case TGSI_TEXTURE_RECT:
 102         case TGSI_TEXTURE_SHADOWRECT:
 103                 return ac_image_2d;
 104         case TGSI_TEXTURE_3D:
 105                 return ac_image_3d;
 106         case TGSI_TEXTURE_CUBE:
 107         case TGSI_TEXTURE_SHADOWCUBE:
 108         case TGSI_TEXTURE_CUBE_ARRAY:
 109         case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
 110                 return ac_image_cube;
 111         case TGSI_TEXTURE_1D_ARRAY:
 112         case TGSI_TEXTURE_SHADOW1D_ARRAY:
 113                 if (screen->info.chip_class == GFX9)
 114                         return ac_image_2darray;
 115                 return ac_image_1darray;
 116         case TGSI_TEXTURE_2D_ARRAY:
 117         case TGSI_TEXTURE_SHADOW2D_ARRAY:
 118                 return ac_image_2darray;
 119         case TGSI_TEXTURE_2D_MSAA:
 120                 return ac_image_2dmsaa;
 121         case TGSI_TEXTURE_2D_ARRAY_MSAA:
 122                 return ac_image_2darraymsaa;
 123         default:
 124                 unreachable("unhandled texture type");
 125         }
 126 }
 127
 128 static enum ac_image_dim
 129 ac_image_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target)
 130 {
 131         enum ac_image_dim dim = ac_texture_dim_from_tgsi_target(screen, target);
 132
 133         /* Match the resource type set in the descriptor. */
 134         if (dim == ac_image_cube ||
 135             (screen->info.chip_class <= GFX8 && dim == ac_image_3d))
 136                 dim = ac_image_2darray;
 137         else if (target == TGSI_TEXTURE_2D && screen->info.chip_class == GFX9) {
 138                 /* When a single layer of a 3D texture is bound, the shader
 139                  * will refer to a 2D target, but the descriptor has a 3D type.
 140                  * Since the HW ignores BASE_ARRAY in this case, we need to
 141                  * send 3 coordinates. This doesn't hurt when the underlying
 142                  * texture is non-3D.
 143                  */
 144                 dim = ac_image_3d;
 145         }
 146
 147         return dim;
 148 }
 149
 150 /**
 151  * Given a 256-bit resource descriptor, force the DCC enable bit to off.
 152  *
 153  * At least on Tonga, executing image stores on images with DCC enabled and
 154  * non-trivial can eventually lead to lockups. This can occur when an
 155  * application binds an image as read-only but then uses a shader that writes
 156  * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
 157  * program termination) in this case, but it doesn't cost much to be a bit
 158  * nicer: disabling DCC in the shader still leads to undefined results but
 159  * avoids the lockup.
 160  */
 161 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
 162                                   LLVMValueRef rsrc)
 163 {
 164         if (ctx->screen->info.chip_class <= GFX7) {
 165                 return rsrc;
 166         } else {
 167                 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
 168                 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
 169                 LLVMValueRef tmp;
 170
 171                 tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
 172                 tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
 173                 return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
 174         }
 175 }
 176
 177 LLVMValueRef si_load_image_desc(struct si_shader_context *ctx,
 178                                 LLVMValueRef list, LLVMValueRef index,
 179                                 enum ac_descriptor_type desc_type,
 180                                 bool uses_store, bool bindless)
 181 {
 182         LLVMBuilderRef builder = ctx->ac.builder;
 183         LLVMValueRef rsrc;
 184
 185         if (desc_type == AC_DESC_BUFFER) {
 186                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0),
 187                                       ctx->i32_1);
 188                 list = LLVMBuildPointerCast(builder, list,
 189                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
 190         } else {
 191                 assert(desc_type == AC_DESC_IMAGE);
 192         }
 193
 194         if (bindless)
 195                 rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
 196         else
 197                 rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
 198
 199         if (ctx->ac.chip_class <= GFX9 &&
 200             desc_type == AC_DESC_IMAGE && uses_store)
 201                 rsrc = force_dcc_off(ctx, rsrc);
 202         return rsrc;
 203 }
 204
 205 /**
 206  * Load the resource descriptor for \p image.
 207  */
 208 static void
 209 image_fetch_rsrc(
 210         struct lp_build_tgsi_context *bld_base,
 211         const struct tgsi_full_src_register *image,
 212         bool is_store, unsigned target,
 213         LLVMValueRef *rsrc)
 214 {
 215         struct si_shader_context *ctx = si_shader_context(bld_base);
 216         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
 217                                              ctx->param_samplers_and_images);
 218         LLVMValueRef index;
 219
 220         if (!image->Register.Indirect) {
 221                 index = LLVMConstInt(ctx->i32,
 222                                      si_get_image_slot(image->Register.Index), 0);
 223         } else {
 224                 /* From the GL_ARB_shader_image_load_store extension spec:
 225                  *
 226                  *    If a shader performs an image load, store, or atomic
 227                  *    operation using an image variable declared as an array,
 228                  *    and if the index used to select an individual element is
 229                  *    negative or greater than or equal to the size of the
 230                  *    array, the results of the operation are undefined but may
 231                  *    not lead to termination.
 232                  */
 233                 index = si_get_bounded_indirect_index(ctx, &image->Indirect,
 234                                                       image->Register.Index,
 235                                                       ctx->num_images);
 236                 index = LLVMBuildSub(ctx->ac.builder,
 237                                      LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0),
 238                                      index, "");
 239         }
 240
 241         bool bindless = false;
 242
 243         if (image->Register.File != TGSI_FILE_IMAGE) {
 244                 /* Bindless descriptors are accessible from a different pair of
 245                  * user SGPR indices.
 246                  */
 247                 rsrc_ptr = LLVMGetParam(ctx->main_fn,
 248                                         ctx->param_bindless_samplers_and_images);
 249                 index = lp_build_emit_fetch_src(bld_base, image,
 250                                                 TGSI_TYPE_UNSIGNED, 0);
 251
 252                 /* For simplicity, bindless image descriptors use fixed
 253                  * 16-dword slots for now.
 254                  */
 255                 index = LLVMBuildMul(ctx->ac.builder, index,
 256                                      LLVMConstInt(ctx->i32, 2, 0), "");
 257                 bindless = true;
 258         }
 259
 260         *rsrc = si_load_image_desc(ctx, rsrc_ptr, index,
 261                                    target == TGSI_TEXTURE_BUFFER ? AC_DESC_BUFFER : AC_DESC_IMAGE,
 262                                    is_store, bindless);
 263 }
 264
 265 static void image_fetch_coords(
 266                 struct lp_build_tgsi_context *bld_base,
 267                 const struct tgsi_full_instruction *inst,
 268                 unsigned src, LLVMValueRef desc,
 269                 LLVMValueRef *coords)
 270 {
 271         struct si_shader_context *ctx = si_shader_context(bld_base);
 272         LLVMBuilderRef builder = ctx->ac.builder;
 273         unsigned target = inst->Memory.Texture;
 274         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
 275         LLVMValueRef tmp;
 276         int chan;
 277
 278         if (target == TGSI_TEXTURE_2D_MSAA ||
 279             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
 280                 /* Need the sample index as well. */
 281                 num_coords++;
 282         }
 283
 284         for (chan = 0; chan < num_coords; ++chan) {
 285                 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
 286                 tmp = ac_to_integer(&ctx->ac, tmp);
 287                 coords[chan] = tmp;
 288         }
 289
 290         if (ctx->screen->info.chip_class == GFX9) {
 291                 /* 1D textures are allocated and used as 2D on GFX9. */
 292                 if (target == TGSI_TEXTURE_1D) {
 293                         coords[1] = ctx->i32_0;
 294                 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
 295                         coords[2] = coords[1];
 296                         coords[1] = ctx->i32_0;
 297                 } else if (target == TGSI_TEXTURE_2D) {
 298                         /* The hw can't bind a slice of a 3D image as a 2D
 299                          * image, because it ignores BASE_ARRAY if the target
 300                          * is 3D. The workaround is to read BASE_ARRAY and set
 301                          * it as the 3rd address operand for all 2D images.
 302                          */
 303                         LLVMValueRef first_layer, const5, mask;
 304
 305                         const5 = LLVMConstInt(ctx->i32, 5, 0);
 306                         mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
 307                         first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
 308                         first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
 309
 310                         coords[2] = first_layer;
 311                 }
 312         }
 313 }
 314
 315 static unsigned get_cache_policy(struct si_shader_context *ctx,
 316                                  const struct tgsi_full_instruction *inst,
 317                                  bool atomic, bool may_store_unaligned,
 318                                  bool writeonly_memory)
 319 {
 320         unsigned cache_policy = 0;
 321
 322         if (!atomic &&
 323             /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores.
 324              * All store opcodes not aligned to a dword are affected.
 325              * The only way to get unaligned stores in radeonsi is through
 326              * shader images. */
 327             ((may_store_unaligned && ctx->screen->info.chip_class == GFX6) ||
 328              /* If this is write-only, don't keep data in L1 to prevent
 329               * evicting L1 cache lines that may be needed by other
 330               * instructions. */
 331              writeonly_memory ||
 332              inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE))) {
 333                 cache_policy |= ac_glc;
 334         }
 335
 336         if (inst->Memory.Qualifier & TGSI_MEMORY_STREAM_CACHE_POLICY)
 337                 cache_policy |= ac_slc;
 338
 339         return cache_policy;
 340 }
 341
 342 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
 343                                    const struct tgsi_full_instruction *inst,
 344                                    LLVMTypeRef type, int arg)
 345 {
 346         LLVMBuilderRef builder = ctx->ac.builder;
 347         LLVMValueRef offset, ptr;
 348         int addr_space;
 349
 350         offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
 351         offset = ac_to_integer(&ctx->ac, offset);
 352
 353         ptr = ctx->ac.lds;
 354         ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
 355         addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
 356         ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
 357
 358         return ptr;
 359 }
 360
 361 static void load_emit_memory(
 362                 struct si_shader_context *ctx,
 363                 struct lp_build_emit_data *emit_data)
 364 {
 365         const struct tgsi_full_instruction *inst = emit_data->inst;
 366         unsigned writemask = inst->Dst[0].Register.WriteMask;
 367         LLVMValueRef channels[4], ptr, derived_ptr, index;
 368         int chan;
 369
 370         ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
 371
 372         for (chan = 0; chan < 4; ++chan) {
 373                 if (!(writemask & (1 << chan))) {
 374                         channels[chan] = LLVMGetUndef(ctx->f32);
 375                         continue;
 376                 }
 377
 378                 index = LLVMConstInt(ctx->i32, chan, 0);
 379                 derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, "");
 380                 channels[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, "");
 381         }
 382         emit_data->output[emit_data->chan] = ac_build_gather_values(&ctx->ac, channels, 4);
 383 }
 384
 385 /**
 386  * Return true if the memory accessed by a LOAD or STORE instruction is
 387  * read-only or write-only, respectively.
 388  *
 389  * \param shader_buffers_reverse_access_mask
 390  *      For LOAD, set this to (store | atomic) slot usage in the shader.
 391  *      For STORE, set this to (load | atomic) slot usage in the shader.
 392  * \param images_reverse_access_mask  Same as above, but for images.
 393  * \param bindless_buffer_reverse_access_mask  Same as above, but for bindless image buffers.
 394  * \param bindless_image_reverse_access_mask   Same as above, but for bindless images.
 395  */
 396 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
 397                                   const struct tgsi_shader_info *info,
 398                                   unsigned shader_buffers_reverse_access_mask,
 399                                   unsigned images_reverse_access_mask,
 400                                   bool bindless_buffer_reverse_access_mask,
 401                                   bool bindless_image_reverse_access_mask)
 402 {
 403         enum tgsi_file_type resource_file;
 404         unsigned resource_index;
 405         bool resource_indirect;
 406
 407         if (inst->Instruction.Opcode == TGSI_OPCODE_STORE) {
 408                 resource_file = inst->Dst[0].Register.File;
 409                 resource_index = inst->Dst[0].Register.Index;
 410                 resource_indirect = inst->Dst[0].Register.Indirect;
 411         } else {
 412                 resource_file = inst->Src[0].Register.File;
 413                 resource_index = inst->Src[0].Register.Index;
 414                 resource_indirect = inst->Src[0].Register.Indirect;
 415         }
 416
 417         assert(resource_file == TGSI_FILE_BUFFER ||
 418                resource_file == TGSI_FILE_IMAGE ||
 419                /* bindless image */
 420                resource_file == TGSI_FILE_INPUT ||
 421                resource_file == TGSI_FILE_OUTPUT ||
 422                resource_file == TGSI_FILE_CONSTANT ||
 423                resource_file == TGSI_FILE_TEMPORARY ||
 424                resource_file == TGSI_FILE_IMMEDIATE);
 425
 426         assert(resource_file != TGSI_FILE_BUFFER ||
 427                inst->Memory.Texture == TGSI_TEXTURE_BUFFER);
 428
 429         bool bindless = resource_file != TGSI_FILE_BUFFER &&
 430                         resource_file != TGSI_FILE_IMAGE;
 431
 432         /* RESTRICT means NOALIAS.
 433          * If there are no writes, we can assume the accessed memory is read-only.
 434          * If there are no reads, we can assume the accessed memory is write-only.
 435          */
 436         if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT && !bindless) {
 437                 unsigned reverse_access_mask;
 438
 439                 if (resource_file == TGSI_FILE_BUFFER) {
 440                         reverse_access_mask = shader_buffers_reverse_access_mask;
 441                 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 442                         reverse_access_mask = info->images_buffers &
 443                                               images_reverse_access_mask;
 444                 } else {
 445                         reverse_access_mask = ~info->images_buffers &
 446                                               images_reverse_access_mask;
 447                 }
 448
 449                 if (resource_indirect) {
 450                         if (!reverse_access_mask)
 451                                 return true;
 452                 } else {
 453                         if (!(reverse_access_mask &
 454                               (1u << resource_index)))
 455                                 return true;
 456                 }
 457         }
 458
 459         /* If there are no buffer writes (for both shader buffers & image
 460          * buffers), it implies that buffer memory is read-only.
 461          * If there are no buffer reads (for both shader buffers & image
 462          * buffers), it implies that buffer memory is write-only.
 463          *
 464          * Same for the case when there are no writes/reads for non-buffer
 465          * images.
 466          */
 467         if (resource_file == TGSI_FILE_BUFFER ||
 468             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 469                 if (!shader_buffers_reverse_access_mask &&
 470                     !(info->images_buffers & images_reverse_access_mask) &&
 471                     !bindless_buffer_reverse_access_mask)
 472                         return true;
 473         } else {
 474                 if (!(~info->images_buffers & images_reverse_access_mask) &&
 475                     !bindless_image_reverse_access_mask)
 476                         return true;
 477         }
 478         return false;
 479 }
 480
 481 static void load_emit(
 482                 const struct lp_build_tgsi_action *action,
 483                 struct lp_build_tgsi_context *bld_base,
 484                 struct lp_build_emit_data *emit_data)
 485 {
 486         struct si_shader_context *ctx = si_shader_context(bld_base);
 487         const struct tgsi_full_instruction * inst = emit_data->inst;
 488         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 489         bool can_speculate = false;
 490         LLVMValueRef vindex = ctx->i32_0;
 491         LLVMValueRef voffset = ctx->i32_0;
 492         struct ac_image_args args = {};
 493
 494         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
 495                 load_emit_memory(ctx, emit_data);
 496                 return;
 497         }
 498
 499         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
 500             inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) {
 501                 bool ubo = inst->Src[0].Register.File == TGSI_FILE_CONSTBUF;
 502                 args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], ubo);
 503                 voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0));
 504         } else {
 505                 unsigned target = inst->Memory.Texture;
 506
 507                 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &args.resource);
 508                 image_fetch_coords(bld_base, inst, 1, args.resource, args.coords);
 509                 vindex = args.coords[0]; /* for buffers only */
 510         }
 511
 512         if (inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) {
 513                 emit_data->output[emit_data->chan] =
 514                         ac_build_buffer_load(&ctx->ac, args.resource,
 515                                              util_last_bit(inst->Dst[0].Register.WriteMask),
 516                                              NULL, voffset, NULL, 0, 0, true, true);
 517                 return;
 518         }
 519
 520         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
 521                 ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE);
 522
 523         can_speculate = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
 524                           is_oneway_access_only(inst, info,
 525                                                 info->shader_buffers_store |
 526                                                 info->shader_buffers_atomic,
 527                                                 info->images_store |
 528                                                 info->images_atomic,
 529                                                 info->uses_bindless_buffer_store |
 530                                                 info->uses_bindless_buffer_atomic,
 531                                                 info->uses_bindless_image_store |
 532                                                 info->uses_bindless_image_atomic);
 533         args.cache_policy = get_cache_policy(ctx, inst, false, false, false);
 534
 535         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 536                 /* Don't use SMEM for shader buffer loads, because LLVM doesn't
 537                  * select SMEM for SI.load.const with a non-constant offset, and
 538                  * constant offsets practically don't exist with shader buffers.
 539                  *
 540                  * Also, SI.load.const doesn't use inst_offset when it's lowered
 541                  * to VMEM, so we just end up with more VALU instructions in the end
 542                  * and no benefit.
 543                  *
 544                  * TODO: Remove this line once LLVM can select SMEM with a non-constant
 545                  *       offset, and can derive inst_offset when VMEM is selected.
 546                  *       After that, si_memory_barrier should invalidate sL1 for shader
 547                  *       buffers.
 548                  */
 549                 emit_data->output[emit_data->chan] =
 550                         ac_build_buffer_load(&ctx->ac, args.resource,
 551                                              util_last_bit(inst->Dst[0].Register.WriteMask),
 552                                              NULL, voffset, NULL, 0,
 553                                              args.cache_policy, can_speculate, false);
 554                 return;
 555         }
 556
 557         if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 558                 unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask);
 559                 LLVMValueRef result =
 560                         ac_build_buffer_load_format(&ctx->ac,
 561                                                     args.resource,
 562                                                     vindex,
 563                                                     ctx->i32_0,
 564                                                     num_channels,
 565                                                     args.cache_policy,
 566                                                     can_speculate);
 567                 emit_data->output[emit_data->chan] =
 568                         ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
 569         } else {
 570                 args.opcode = ac_image_load;
 571                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 572                 args.attributes = ac_get_load_intr_attribs(can_speculate);
 573                 args.dmask = 0xf;
 574
 575                 emit_data->output[emit_data->chan] =
 576                         ac_build_image_opcode(&ctx->ac, &args);
 577         }
 578 }
 579
 580 static void store_emit_buffer(struct si_shader_context *ctx,
 581                               LLVMValueRef resource,
 582                               unsigned writemask,
 583                               LLVMValueRef value,
 584                               LLVMValueRef voffset,
 585                               unsigned cache_policy,
 586                               bool writeonly_memory)
 587 {
 588         LLVMBuilderRef builder = ctx->ac.builder;
 589         LLVMValueRef base_data = value;
 590         LLVMValueRef base_offset = voffset;
 591
 592         while (writemask) {
 593                 int start, count;
 594                 LLVMValueRef data, voff;
 595
 596                 u_bit_scan_consecutive_range(&writemask, &start, &count);
 597
 598                 if (count == 3 && ac_has_vec3_support(ctx->ac.chip_class, false)) {
 599                         LLVMValueRef values[3] = {
 600                                 LLVMBuildExtractElement(builder, base_data,
 601                                                         LLVMConstInt(ctx->i32, start, 0), ""),
 602                                 LLVMBuildExtractElement(builder, base_data,
 603                                                         LLVMConstInt(ctx->i32, start + 1, 0), ""),
 604                                 LLVMBuildExtractElement(builder, base_data,
 605                                                         LLVMConstInt(ctx->i32, start + 2, 0), ""),
 606                         };
 607                         data = ac_build_gather_values(&ctx->ac, values, 3);
 608                 } else if (count >= 3) {
 609                         data = base_data;
 610                 } else if (count == 2) {
 611                         LLVMValueRef values[2] = {
 612                                 LLVMBuildExtractElement(builder, base_data,
 613                                                         LLVMConstInt(ctx->i32, start, 0), ""),
 614                                 LLVMBuildExtractElement(builder, base_data,
 615                                                         LLVMConstInt(ctx->i32, start + 1, 0), ""),
 616                         };
 617
 618                         data = ac_build_gather_values(&ctx->ac, values, 2);
 619                 } else {
 620                         assert(count == 1);
 621                         data = LLVMBuildExtractElement(
 622                                 builder, base_data,
 623                                 LLVMConstInt(ctx->i32, start, 0), "");
 624                 }
 625
 626                 voff = base_offset;
 627                 if (start != 0) {
 628                         voff = LLVMBuildAdd(
 629                                 builder, voff,
 630                                 LLVMConstInt(ctx->i32, start * 4, 0), "");
 631                 }
 632
 633                 ac_build_buffer_store_dword(&ctx->ac, resource, data, count,
 634                                             voff, ctx->i32_0, 0, cache_policy,
 635                                             false);
 636         }
 637 }
 638
 639 static void store_emit_memory(
 640                 struct si_shader_context *ctx,
 641                 struct lp_build_emit_data *emit_data)
 642 {
 643         const struct tgsi_full_instruction *inst = emit_data->inst;
 644         LLVMBuilderRef builder = ctx->ac.builder;
 645         unsigned writemask = inst->Dst[0].Register.WriteMask;
 646         LLVMValueRef ptr, derived_ptr, data, index;
 647         int chan;
 648
 649         ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
 650
 651         for (chan = 0; chan < 4; ++chan) {
 652                 if (!(writemask & (1 << chan))) {
 653                         continue;
 654                 }
 655                 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
 656                 index = LLVMConstInt(ctx->i32, chan, 0);
 657                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
 658                 LLVMBuildStore(builder, data, derived_ptr);
 659         }
 660 }
 661
 662 static void store_emit(
 663                 const struct lp_build_tgsi_action *action,
 664                 struct lp_build_tgsi_context *bld_base,
 665                 struct lp_build_emit_data *emit_data)
 666 {
 667         struct si_shader_context *ctx = si_shader_context(bld_base);
 668         const struct tgsi_full_instruction * inst = emit_data->inst;
 669         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 670         struct tgsi_full_src_register resource_reg =
 671                 tgsi_full_src_register_from_dst(&inst->Dst[0]);
 672         unsigned target = inst->Memory.Texture;
 673
 674         if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
 675                 store_emit_memory(ctx, emit_data);
 676                 return;
 677         }
 678
 679         bool writeonly_memory = is_oneway_access_only(inst, info,
 680                                                       info->shader_buffers_load |
 681                                                       info->shader_buffers_atomic,
 682                                                       info->images_load |
 683                                                       info->images_atomic,
 684                                                       info->uses_bindless_buffer_load |
 685                                                       info->uses_bindless_buffer_atomic,
 686                                                       info->uses_bindless_image_load |
 687                                                       info->uses_bindless_image_atomic);
 688         LLVMValueRef chans[4];
 689         LLVMValueRef vindex = ctx->i32_0;
 690         LLVMValueRef voffset = ctx->i32_0;
 691         struct ac_image_args args = {};
 692
 693         for (unsigned chan = 0; chan < 4; ++chan)
 694                 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
 695
 696         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
 697                 args.resource = shader_buffer_fetch_rsrc(ctx, &resource_reg, false);
 698                 voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 0, 0));
 699         } else {
 700                 image_fetch_rsrc(bld_base, &resource_reg, true, target, &args.resource);
 701                 image_fetch_coords(bld_base, inst, 0, args.resource, args.coords);
 702                 vindex = args.coords[0]; /* for buffers only */
 703         }
 704
 705         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
 706                 ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE);
 707
 708         bool is_image = inst->Dst[0].Register.File != TGSI_FILE_BUFFER;
 709         args.cache_policy = get_cache_policy(ctx, inst,
 710                                              false, /* atomic */
 711                                              is_image, /* may_store_unaligned */
 712                                              writeonly_memory);
 713
 714         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
 715                 store_emit_buffer(ctx, args.resource, inst->Dst[0].Register.WriteMask,
 716                                   ac_build_gather_values(&ctx->ac, chans, 4),
 717                                   voffset, args.cache_policy, writeonly_memory);
 718                 return;
 719         }
 720
 721         if (target == TGSI_TEXTURE_BUFFER) {
 722                 unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask);
 723
 724                 ac_build_buffer_store_format(&ctx->ac, args.resource,
 725                                              ac_build_gather_values(&ctx->ac, chans, num_channels),
 726                                              vindex, ctx->i32_0 /* voffset */,
 727                                              num_channels,
 728                                              args.cache_policy);
 729         } else {
 730                 args.opcode = ac_image_store;
 731                 args.data[0] = ac_build_gather_values(&ctx->ac, chans, 4);
 732                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 733                 args.attributes = AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY;
 734                 args.dmask = 0xf;
 735
 736                 emit_data->output[emit_data->chan] =
 737                         ac_build_image_opcode(&ctx->ac, &args);
 738         }
 739 }
 740
 741 static void atomic_emit_memory(struct si_shader_context *ctx,
 742                                struct lp_build_emit_data *emit_data) {
 743         LLVMBuilderRef builder = ctx->ac.builder;
 744         const struct tgsi_full_instruction * inst = emit_data->inst;
 745         LLVMValueRef ptr, result, arg;
 746         const char *sync_scope = HAVE_LLVM >= 0x0900 ? "workgroup-one-as" : "workgroup";
 747
 748         ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
 749
 750         arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
 751         arg = ac_to_integer(&ctx->ac, arg);
 752
 753         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 754                 LLVMValueRef new_data;
 755                 new_data = lp_build_emit_fetch(&ctx->bld_base,
 756                                                inst, 3, 0);
 757
 758                 new_data = ac_to_integer(&ctx->ac, new_data);
 759
 760                 result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, arg, new_data,
 761                                                   sync_scope);
 762                 result = LLVMBuildExtractValue(builder, result, 0, "");
 763         } else {
 764                 LLVMAtomicRMWBinOp op;
 765
 766                 switch(inst->Instruction.Opcode) {
 767                         case TGSI_OPCODE_ATOMUADD:
 768                                 op = LLVMAtomicRMWBinOpAdd;
 769                                 break;
 770                         case TGSI_OPCODE_ATOMXCHG:
 771                                 op = LLVMAtomicRMWBinOpXchg;
 772                                 break;
 773                         case TGSI_OPCODE_ATOMAND:
 774                                 op = LLVMAtomicRMWBinOpAnd;
 775                                 break;
 776                         case TGSI_OPCODE_ATOMOR:
 777                                 op = LLVMAtomicRMWBinOpOr;
 778                                 break;
 779                         case TGSI_OPCODE_ATOMXOR:
 780                                 op = LLVMAtomicRMWBinOpXor;
 781                                 break;
 782                         case TGSI_OPCODE_ATOMUMIN:
 783                                 op = LLVMAtomicRMWBinOpUMin;
 784                                 break;
 785                         case TGSI_OPCODE_ATOMUMAX:
 786                                 op = LLVMAtomicRMWBinOpUMax;
 787                                 break;
 788                         case TGSI_OPCODE_ATOMIMIN:
 789                                 op = LLVMAtomicRMWBinOpMin;
 790                                 break;
 791                         case TGSI_OPCODE_ATOMIMAX:
 792                                 op = LLVMAtomicRMWBinOpMax;
 793                                 break;
 794                         default:
 795                                 unreachable("unknown atomic opcode");
 796                 }
 797
 798                 result = ac_build_atomic_rmw(&ctx->ac, op, ptr, arg, sync_scope);
 799         }
 800         emit_data->output[emit_data->chan] =
 801                 LLVMBuildBitCast(builder, result, ctx->f32, "");
 802 }
 803
 804 static void atomic_emit(
 805                 const struct lp_build_tgsi_action *action,
 806                 struct lp_build_tgsi_context *bld_base,
 807                 struct lp_build_emit_data *emit_data)
 808 {
 809         struct si_shader_context *ctx = si_shader_context(bld_base);
 810         const struct tgsi_full_instruction * inst = emit_data->inst;
 811         struct ac_image_args args = {};
 812         unsigned num_data = 0;
 813         LLVMValueRef vindex = ctx->i32_0;
 814         LLVMValueRef voffset = ctx->i32_0;
 815
 816         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
 817                 atomic_emit_memory(ctx, emit_data);
 818                 return;
 819         }
 820
 821         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 822                 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
 823                  * of arguments, which is reversed relative to TGSI (and GLSL)
 824                  */
 825                 args.data[num_data++] =
 826                         ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 3, 0));
 827         }
 828
 829         args.data[num_data++] =
 830                 ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 2, 0));
 831         args.cache_policy = get_cache_policy(ctx, inst, true, false, false);
 832
 833         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 834                 args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], false);
 835                 voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0));
 836         } else {
 837                 image_fetch_rsrc(bld_base, &inst->Src[0], true,
 838                                 inst->Memory.Texture, &args.resource);
 839                 image_fetch_coords(bld_base, inst, 1, args.resource, args.coords);
 840                 vindex = args.coords[0]; /* for buffers only */
 841         }
 842
 843         if (HAVE_LLVM >= 0x0800 &&
 844             inst->Src[0].Register.File != TGSI_FILE_BUFFER &&
 845             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 846                 LLVMValueRef buf_args[7];
 847                 unsigned num_args = 0;
 848
 849                 buf_args[num_args++] = args.data[0];
 850                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
 851                         buf_args[num_args++] = args.data[1];
 852
 853                 buf_args[num_args++] = args.resource;
 854                 buf_args[num_args++] = vindex;
 855                 buf_args[num_args++] = voffset;
 856                 buf_args[num_args++] = ctx->i32_0; /* soffset */
 857                 buf_args[num_args++] = LLVMConstInt(ctx->i32, args.cache_policy & ac_slc, 0);
 858
 859                 char intrinsic_name[64];
 860                 snprintf(intrinsic_name, sizeof(intrinsic_name),
 861                          "llvm.amdgcn.struct.buffer.atomic.%s", action->intr_name);
 862                 emit_data->output[emit_data->chan] =
 863                         ac_to_float(&ctx->ac,
 864                                     ac_build_intrinsic(&ctx->ac, intrinsic_name,
 865                                                        ctx->i32, buf_args, num_args, 0));
 866                 return;
 867         }
 868
 869         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
 870             (HAVE_LLVM < 0x0800 &&
 871              inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
 872                 LLVMValueRef buf_args[7];
 873                 unsigned num_args = 0;
 874
 875                 buf_args[num_args++] = args.data[0];
 876                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
 877                         buf_args[num_args++] = args.data[1];
 878
 879                 buf_args[num_args++] = args.resource;
 880                 buf_args[num_args++] = vindex;
 881                 buf_args[num_args++] = voffset;
 882                 buf_args[num_args++] = args.cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
 883
 884                 char intrinsic_name[40];
 885                 snprintf(intrinsic_name, sizeof(intrinsic_name),
 886                          "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
 887                 emit_data->output[emit_data->chan] =
 888                         ac_to_float(&ctx->ac,
 889                                     ac_build_intrinsic(&ctx->ac, intrinsic_name,
 890                                                        ctx->i32, buf_args, num_args, 0));
 891         } else {
 892                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 893                         args.opcode = ac_image_atomic_cmpswap;
 894                 } else {
 895                         args.opcode = ac_image_atomic;
 896                         switch (inst->Instruction.Opcode) {
 897                         case TGSI_OPCODE_ATOMXCHG: args.atomic = ac_atomic_swap; break;
 898                         case TGSI_OPCODE_ATOMUADD: args.atomic = ac_atomic_add; break;
 899                         case TGSI_OPCODE_ATOMAND: args.atomic = ac_atomic_and; break;
 900                         case TGSI_OPCODE_ATOMOR: args.atomic = ac_atomic_or; break;
 901                         case TGSI_OPCODE_ATOMXOR: args.atomic = ac_atomic_xor; break;
 902                         case TGSI_OPCODE_ATOMUMIN: args.atomic = ac_atomic_umin; break;
 903                         case TGSI_OPCODE_ATOMUMAX: args.atomic = ac_atomic_umax; break;
 904                         case TGSI_OPCODE_ATOMIMIN: args.atomic = ac_atomic_smin; break;
 905                         case TGSI_OPCODE_ATOMIMAX: args.atomic = ac_atomic_smax; break;
 906                         default: unreachable("unhandled image atomic");
 907                         }
 908                 }
 909
 910                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 911                 emit_data->output[emit_data->chan] =
 912                         ac_to_float(&ctx->ac, ac_build_image_opcode(&ctx->ac, &args));
 913         }
 914 }
 915
 916 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
 917                                 unsigned target, LLVMValueRef out)
 918 {
 919         LLVMBuilderRef builder = ctx->ac.builder;
 920
 921         /* 1D textures are allocated and used as 2D on GFX9. */
 922         if (ctx->screen->info.chip_class == GFX9 &&
 923             (target == TGSI_TEXTURE_1D_ARRAY ||
 924              target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
 925                 LLVMValueRef layers =
 926                         LLVMBuildExtractElement(builder, out,
 927                                                 LLVMConstInt(ctx->i32, 2, 0), "");
 928                 out = LLVMBuildInsertElement(builder, out, layers,
 929                                              ctx->i32_1, "");
 930         }
 931
 932         /* Divide the number of layers by 6 to get the number of cubes. */
 933         if (target == TGSI_TEXTURE_CUBE_ARRAY ||
 934             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
 935                 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
 936
 937                 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
 938                 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
 939
 940                 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
 941         }
 942         return out;
 943 }
 944
 945 static void resq_emit(
 946                 const struct lp_build_tgsi_action *action,
 947                 struct lp_build_tgsi_context *bld_base,
 948                 struct lp_build_emit_data *emit_data)
 949 {
 950         struct si_shader_context *ctx = si_shader_context(bld_base);
 951         LLVMBuilderRef builder = ctx->ac.builder;
 952         const struct tgsi_full_instruction *inst = emit_data->inst;
 953         const struct tgsi_full_src_register *reg =
 954                 &inst->Src[inst->Instruction.Opcode == TGSI_OPCODE_TXQ ? 1 : 0];
 955
 956         if (reg->Register.File == TGSI_FILE_BUFFER) {
 957                 LLVMValueRef rsrc = shader_buffer_fetch_rsrc(ctx, reg, false);
 958
 959                 emit_data->output[emit_data->chan] =
 960                         LLVMBuildExtractElement(builder, rsrc,
 961                                                 LLVMConstInt(ctx->i32, 2, 0), "");
 962                 return;
 963         }
 964
 965         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
 966             inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
 967                 LLVMValueRef rsrc;
 968
 969                 tex_fetch_ptrs(bld_base, emit_data, &rsrc, NULL, NULL);
 970                 /* Read the size from the buffer descriptor directly. */
 971                 emit_data->output[emit_data->chan] =
 972                         get_buffer_size(bld_base, rsrc);
 973                 return;
 974         }
 975
 976         if (inst->Instruction.Opcode == TGSI_OPCODE_RESQ &&
 977             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 978                 LLVMValueRef rsrc;
 979
 980                 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, &rsrc);
 981                 emit_data->output[emit_data->chan] =
 982                         get_buffer_size(bld_base, rsrc);
 983                 return;
 984         }
 985
 986         unsigned target;
 987
 988         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
 989                 target = inst->Texture.Texture;
 990         } else {
 991                 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
 992                         target = TGSI_TEXTURE_2D_ARRAY;
 993                 else
 994                         target = inst->Memory.Texture;
 995         }
 996
 997         struct ac_image_args args = {};
 998         args.opcode = ac_image_get_resinfo;
 999         args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1000         args.dmask = 0xf;
1001         args.attributes = AC_FUNC_ATTR_READNONE;
1002
1003         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
1004                 tex_fetch_ptrs(bld_base, emit_data, &args.resource, NULL, NULL);
1005                 args.lod = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
1006         } else {
1007                 image_fetch_rsrc(bld_base, reg, false, target, &args.resource);
1008                 args.lod = ctx->i32_0;
1009         }
1010
1011         emit_data->output[emit_data->chan] =
1012                 fix_resinfo(ctx, target, ac_build_image_opcode(&ctx->ac, &args));
1013 }
1014
1015 /**
1016  * Load an image view, fmask view. or sampler state descriptor.
1017  */
1018 LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx,
1019                                   LLVMValueRef list, LLVMValueRef index,
1020                                   enum ac_descriptor_type type)
1021 {
1022         LLVMBuilderRef builder = ctx->ac.builder;
1023
1024         switch (type) {
1025         case AC_DESC_IMAGE:
1026                 /* The image is at [0:7]. */
1027                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
1028                 break;
1029         case AC_DESC_BUFFER:
1030                 /* The buffer is in [4:7]. */
1031                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 4, 0),
1032                                       ctx->i32_1);
1033                 list = LLVMBuildPointerCast(builder, list,
1034                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
1035                 break;
1036         case AC_DESC_FMASK:
1037                 /* The FMASK is at [8:15]. */
1038                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0),
1039                                       ctx->i32_1);
1040                 break;
1041         case AC_DESC_SAMPLER:
1042                 /* The sampler state is at [12:15]. */
1043                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 4, 0),
1044                                       LLVMConstInt(ctx->i32, 3, 0));
1045                 list = LLVMBuildPointerCast(builder, list,
1046                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
1047                 break;
1048         case AC_DESC_PLANE_0:
1049         case AC_DESC_PLANE_1:
1050         case AC_DESC_PLANE_2:
1051                 /* Only used for the multiplane image support for Vulkan. Should
1052                  * never be reached in radeonsi.
1053                  */
1054                 unreachable("Plane descriptor requested in radeonsi.");
1055         }
1056
1057         return ac_build_load_to_sgpr(&ctx->ac, list, index);
1058 }
1059
1060 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
1061  *
1062  * GFX6-GFX7:
1063  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
1064  *   filtering manually. The driver sets img7 to a mask clearing
1065  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
1066  *     s_and_b32 samp0, samp0, img7
1067  *
1068  * GFX8:
1069  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
1070  */
1071 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
1072                                            LLVMValueRef res, LLVMValueRef samp)
1073 {
1074         LLVMValueRef img7, samp0;
1075
1076         if (ctx->screen->info.chip_class >= GFX8)
1077                 return samp;
1078
1079         img7 = LLVMBuildExtractElement(ctx->ac.builder, res,
1080                                        LLVMConstInt(ctx->i32, 7, 0), "");
1081         samp0 = LLVMBuildExtractElement(ctx->ac.builder, samp,
1082                                         ctx->i32_0, "");
1083         samp0 = LLVMBuildAnd(ctx->ac.builder, samp0, img7, "");
1084         return LLVMBuildInsertElement(ctx->ac.builder, samp, samp0,
1085                                       ctx->i32_0, "");
1086 }
1087
1088 static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
1089                            struct lp_build_emit_data *emit_data,
1090                            LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
1091                            LLVMValueRef *fmask_ptr)
1092 {
1093         struct si_shader_context *ctx = si_shader_context(bld_base);
1094         LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images);
1095         const struct tgsi_full_instruction *inst = emit_data->inst;
1096         const struct tgsi_full_src_register *reg;
1097         unsigned target = inst->Texture.Texture;
1098         unsigned sampler_src;
1099         LLVMValueRef index;
1100
1101         sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
1102         reg = &emit_data->inst->Src[sampler_src];
1103
1104         if (reg->Register.Indirect) {
1105                 index = si_get_bounded_indirect_index(ctx,
1106                                                       &reg->Indirect,
1107                                                       reg->Register.Index,
1108                                                       ctx->num_samplers);
1109                 index = LLVMBuildAdd(ctx->ac.builder, index,
1110                                      LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), "");
1111         } else {
1112                 index = LLVMConstInt(ctx->i32,
1113                                      si_get_sampler_slot(reg->Register.Index), 0);
1114         }
1115
1116         if (reg->Register.File != TGSI_FILE_SAMPLER) {
1117                 /* Bindless descriptors are accessible from a different pair of
1118                  * user SGPR indices.
1119                  */
1120                 list = LLVMGetParam(ctx->main_fn,
1121                                     ctx->param_bindless_samplers_and_images);
1122                 index = lp_build_emit_fetch_src(bld_base, reg,
1123                                                 TGSI_TYPE_UNSIGNED, 0);
1124
1125                 /* Since bindless handle arithmetic can contain an unsigned integer
1126                  * wraparound and si_load_sampler_desc assumes there isn't any,
1127                  * use GEP without "inbounds" (inside ac_build_pointer_add)
1128                  * to prevent incorrect code generation and hangs.
1129                  */
1130                 index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
1131                 list = ac_build_pointer_add(&ctx->ac, list, index);
1132                 index = ctx->i32_0;
1133         }
1134
1135         if (target == TGSI_TEXTURE_BUFFER)
1136                 *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_BUFFER);
1137         else
1138                 *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_IMAGE);
1139
1140         if (samp_ptr)
1141                 *samp_ptr = NULL;
1142         if (fmask_ptr)
1143                 *fmask_ptr = NULL;
1144
1145         if (target == TGSI_TEXTURE_2D_MSAA ||
1146             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
1147                 if (fmask_ptr)
1148                         *fmask_ptr = si_load_sampler_desc(ctx, list, index,
1149                                                           AC_DESC_FMASK);
1150         } else if (target != TGSI_TEXTURE_BUFFER) {
1151                 if (samp_ptr) {
1152                         *samp_ptr = si_load_sampler_desc(ctx, list, index,
1153                                                          AC_DESC_SAMPLER);
1154                         *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
1155                 }
1156         }
1157 }
1158
/* Gather4 should follow the same rules as bilinear filtering, but the hardware
 * incorrectly forces nearest filtering if the texture format is integer.
 * The only effect it has on Gather4, which always returns 4 texels for
 * bilinear filtering, is that the final coordinates are off by 0.5 of
 * the texel size.
 *
 * The workaround is to subtract 0.5 from the unnormalized coordinates,
 * or (0.5 / size) from the normalized coordinates.
 *
 * However, cube textures with 8_8_8_8 data formats require a different
 * workaround of overriding the num format to USCALED/SSCALED. This would lose
 * precision in 32-bit data formats, so it needs to be applied dynamically at
 * runtime. In this case, return an i1 value that indicates whether the
 * descriptor was overridden (and hence a fixup of the sampler result is needed).
 *
 * args->resource (cube targets only) and args->coords[0..1] are modified in
 * place. The return value is NULL for non-cube targets.
 */
static LLVMValueRef
si_lower_gather4_integer(struct si_shader_context *ctx,
                         struct ac_image_args *args,
                         unsigned target,
                         enum tgsi_return_type return_type)
{
        LLVMBuilderRef builder = ctx->ac.builder;
        LLVMValueRef wa_8888 = NULL;
        LLVMValueRef half_texel[2];

        assert(return_type == TGSI_RETURN_TYPE_SINT ||
               return_type == TGSI_RETURN_TYPE_UINT);

        if (target == TGSI_TEXTURE_CUBE ||
            target == TGSI_TEXTURE_CUBE_ARRAY) {
                LLVMValueRef formats;
                LLVMValueRef data_format;
                LLVMValueRef wa_formats;

                /* Element 1 of the descriptor holds the format fields. */
                formats = LLVMBuildExtractElement(builder, args->resource, ctx->i32_1, "");

                /* Isolate the 6-bit field starting at bit 20 and compare it
                 * against the 8_8_8_8 data format.
                 */
                data_format = LLVMBuildLShr(builder, formats,
                                            LLVMConstInt(ctx->i32, 20, false), "");
                data_format = LLVMBuildAnd(builder, data_format,
                                           LLVMConstInt(ctx->i32, (1u << 6) - 1, false), "");
                wa_8888 = LLVMBuildICmp(
                        builder, LLVMIntEQ, data_format,
                        LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false),
                        "");

                /* Build the overridden word: clear NUM_FORMAT, then set it to
                 * USCALED or SSCALED depending on the return type.
                 */
                uint32_t wa_num_format =
                        return_type == TGSI_RETURN_TYPE_UINT ?
                        S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_USCALED) :
                        S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SSCALED);
                wa_formats = LLVMBuildAnd(builder, formats,
                                          LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false),
                                          "");
                wa_formats = LLVMBuildOr(builder, wa_formats,
                                        LLVMConstInt(ctx->i32, wa_num_format, false), "");

                /* Use the overridden word only when the runtime check matched. */
                formats = LLVMBuildSelect(builder, wa_8888, wa_formats, formats, "");
                args->resource = LLVMBuildInsertElement(
                        builder, args->resource, formats, ctx->i32_1, "");
        }

        if (target == TGSI_TEXTURE_RECT ||
            target == TGSI_TEXTURE_SHADOWRECT) {
                /* RECT targets get a constant -0.5 offset; presumably this is
                 * the unnormalized-coordinate case described above.
                 */
                assert(!wa_8888);
                half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
        } else {
                struct ac_image_args resinfo = {};
                /* if_ctx is only initialized and used when wa_8888 is set. */
                struct lp_build_if_state if_ctx;

                if (wa_8888) {
                        /* Skip the texture size query entirely if we don't need it. */
                        lp_build_if(&if_ctx, &ctx->gallivm, LLVMBuildNot(builder, wa_8888, ""));
                }

                /* Query the texture size. */
                resinfo.opcode = ac_image_get_resinfo;
                resinfo.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
                resinfo.resource = args->resource;
                resinfo.sampler = args->sampler;
                resinfo.lod = ctx->ac.i32_0;
                resinfo.dmask = 0xf;
                resinfo.attributes = AC_FUNC_ATTR_READNONE;

                LLVMValueRef texsize =
                        fix_resinfo(ctx, target,
                                    ac_build_image_opcode(&ctx->ac, &resinfo));

                /* Compute -0.5 / size. */
                for (unsigned c = 0; c < 2; c++) {
                        half_texel[c] =
                                LLVMBuildExtractElement(builder, texsize,
                                                        LLVMConstInt(ctx->i32, c, 0), "");
                        half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
                        half_texel[c] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, half_texel[c]);
                        half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
                                                      LLVMConstReal(ctx->f32, -0.5), "");
                }

                if (wa_8888) {
                        lp_build_endif(&if_ctx);

                        /* Merge the two paths: the computed offset when coming
                         * from the true block, 0.0 when coming from the entry
                         * block (size query skipped).
                         */
                        LLVMBasicBlockRef bb[2] = { if_ctx.true_block, if_ctx.entry_block };

                        for (unsigned c = 0; c < 2; c++) {
                                LLVMValueRef values[2] = { half_texel[c], ctx->ac.f32_0 };
                                half_texel[c] = ac_build_phi(&ctx->ac, ctx->f32, 2,
                                                             values, bb);
                        }
                }
        }

        /* Apply the offset to the first two coordinates. */
        for (unsigned c = 0; c < 2; c++) {
                LLVMValueRef tmp;
                tmp = ac_to_float(&ctx->ac, args->coords[c]);
                tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
                args->coords[c] = ac_to_integer(&ctx->ac, tmp);
        }

        return wa_8888;
}
1278
1279 /* The second half of the cube texture 8_8_8_8 integer workaround: adjust the
1280  * result after the gather operation.
1281  */
1282 static LLVMValueRef
1283 si_fix_gather4_integer_result(struct si_shader_context *ctx,
1284                            LLVMValueRef result,
1285                            enum tgsi_return_type return_type,
1286                            LLVMValueRef wa)
1287 {
1288         LLVMBuilderRef builder = ctx->ac.builder;
1289
1290         assert(return_type == TGSI_RETURN_TYPE_SINT ||
1291                return_type == TGSI_RETURN_TYPE_UINT);
1292
1293         for (unsigned chan = 0; chan < 4; ++chan) {
1294                 LLVMValueRef chanv = LLVMConstInt(ctx->i32, chan, false);
1295                 LLVMValueRef value;
1296                 LLVMValueRef wa_value;
1297
1298                 value = LLVMBuildExtractElement(builder, result, chanv, "");
1299
1300                 if (return_type == TGSI_RETURN_TYPE_UINT)
1301                         wa_value = LLVMBuildFPToUI(builder, value, ctx->i32, "");
1302                 else
1303                         wa_value = LLVMBuildFPToSI(builder, value, ctx->i32, "");
1304                 wa_value = ac_to_float(&ctx->ac, wa_value);
1305                 value = LLVMBuildSelect(builder, wa, wa_value, value, "");
1306
1307                 result = LLVMBuildInsertElement(builder, result, value, chanv, "");
1308         }
1309
1310         return result;
1311 }
1312
1313 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
1314                                 struct lp_build_tgsi_context *bld_base,
1315                                 struct lp_build_emit_data *emit_data)
1316 {
1317         struct si_shader_context *ctx = si_shader_context(bld_base);
1318         const struct tgsi_full_instruction *inst = emit_data->inst;
1319         unsigned opcode = inst->Instruction.Opcode;
1320         unsigned target = inst->Texture.Texture;
1321         struct ac_image_args args = {};
1322         int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
1323         unsigned chan;
1324         bool has_offset = inst->Texture.NumOffsets > 0;
1325         LLVMValueRef fmask_ptr = NULL;
1326
1327         tex_fetch_ptrs(bld_base, emit_data, &args.resource, &args.sampler, &fmask_ptr);
1328
1329         if (target == TGSI_TEXTURE_BUFFER) {
1330                 LLVMValueRef vindex = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
1331                 unsigned num_channels =
1332                         util_last_bit(inst->Dst[0].Register.WriteMask);
1333                 LLVMValueRef result =
1334                         ac_build_buffer_load_format(&ctx->ac,
1335                                                     args.resource,
1336                                                     vindex,
1337                                                     ctx->i32_0,
1338                                                     num_channels, 0, true);
1339                 emit_data->output[emit_data->chan] =
1340                         ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
1341                 return;
1342         }
1343
1344         /* Fetch and project texture coordinates */
1345         args.coords[3] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_W);
1346         for (chan = 0; chan < 3; chan++) {
1347                 args.coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
1348                 if (opcode == TGSI_OPCODE_TXP)
1349                         args.coords[chan] = ac_build_fdiv(&ctx->ac,
1350                                 args.coords[chan], args.coords[3]);
1351         }
1352
1353         if (opcode == TGSI_OPCODE_TXP)
1354                 args.coords[3] = ctx->ac.f32_1;
1355
1356         /* Pack offsets. */
1357         if (has_offset &&
1358             opcode != TGSI_OPCODE_TXF &&
1359             opcode != TGSI_OPCODE_TXF_LZ) {
1360                 /* The offsets are six-bit signed integers packed like this:
1361                  *   X=[5:0], Y=[13:8], and Z=[21:16].
1362                  */
1363                 LLVMValueRef offset[3], pack;
1364
1365                 assert(inst->Texture.NumOffsets == 1);
1366
1367                 for (chan = 0; chan < 3; chan++) {
1368                         offset[chan] = lp_build_emit_fetch_texoffset(bld_base, inst, 0, chan);
1369                         offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan],
1370                                                     LLVMConstInt(ctx->i32, 0x3f, 0), "");
1371                         if (chan)
1372                                 offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan],
1373                                                             LLVMConstInt(ctx->i32, chan*8, 0), "");
1374                 }
1375
1376                 pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], "");
1377                 pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], "");
1378                 args.offset = pack;
1379         }
1380
1381         /* Pack LOD bias value */
1382         if (opcode == TGSI_OPCODE_TXB)
1383                 args.bias = args.coords[3];
1384         if (opcode == TGSI_OPCODE_TXB2)
1385                 args.bias = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1386
1387         /* Pack depth comparison value */
1388         if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
1389                 LLVMValueRef z;
1390
1391                 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1392                         z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1393                 } else {
1394                         assert(ref_pos >= 0);
1395                         z = args.coords[ref_pos];
1396                 }
1397
1398                 /* Section 8.23.1 (Depth Texture Comparison Mode) of the
1399                  * OpenGL 4.5 spec says:
1400                  *
1401                  *    "If the texture’s internal format indicates a fixed-point
1402                  *     depth texture, then D_t and D_ref are clamped to the
1403                  *     range [0, 1]; otherwise no clamping is performed."
1404                  *
1405                  * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
1406                  * so the depth comparison value isn't clamped for Z16 and
1407                  * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
1408                  * an explicitly clamped 32-bit float format.
1409                  */
1410                 if (ctx->screen->info.chip_class >= GFX8 &&
1411                     ctx->screen->info.chip_class <= GFX9) {
1412                         LLVMValueRef upgraded;
1413                         LLVMValueRef clamped;
1414                         upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler,
1415                                                            LLVMConstInt(ctx->i32, 3, false), "");
1416                         upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded,
1417                                                  LLVMConstInt(ctx->i32, 29, false), "");
1418                         upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->i1, "");
1419                         clamped = ac_build_clamp(&ctx->ac, z);
1420                         z = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, z, "");
1421                 }
1422
1423                 args.compare = z;
1424         }
1425
1426         /* Pack user derivatives */
1427         if (opcode == TGSI_OPCODE_TXD) {
1428                 int param, num_src_deriv_channels, num_dst_deriv_channels;
1429
1430                 switch (target) {
1431                 case TGSI_TEXTURE_3D:
1432                         num_src_deriv_channels = 3;
1433                         num_dst_deriv_channels = 3;
1434                         break;
1435                 case TGSI_TEXTURE_2D:
1436                 case TGSI_TEXTURE_SHADOW2D:
1437                 case TGSI_TEXTURE_RECT:
1438                 case TGSI_TEXTURE_SHADOWRECT:
1439                 case TGSI_TEXTURE_2D_ARRAY:
1440                 case TGSI_TEXTURE_SHADOW2D_ARRAY:
1441                         num_src_deriv_channels = 2;
1442                         num_dst_deriv_channels = 2;
1443                         break;
1444                 case TGSI_TEXTURE_CUBE:
1445                 case TGSI_TEXTURE_SHADOWCUBE:
1446                 case TGSI_TEXTURE_CUBE_ARRAY:
1447                 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
1448                         /* Cube derivatives will be converted to 2D. */
1449                         num_src_deriv_channels = 3;
1450                         num_dst_deriv_channels = 3;
1451                         break;
1452                 case TGSI_TEXTURE_1D:
1453                 case TGSI_TEXTURE_SHADOW1D:
1454                 case TGSI_TEXTURE_1D_ARRAY:
1455                 case TGSI_TEXTURE_SHADOW1D_ARRAY:
1456                         num_src_deriv_channels = 1;
1457
1458                         /* 1D textures are allocated and used as 2D on GFX9. */
1459                         if (ctx->screen->info.chip_class == GFX9) {
1460                                 num_dst_deriv_channels = 2;
1461                         } else {
1462                                 num_dst_deriv_channels = 1;
1463                         }
1464                         break;
1465                 default:
1466                         unreachable("invalid target");
1467                 }
1468
1469                 for (param = 0; param < 2; param++) {
1470                         for (chan = 0; chan < num_src_deriv_channels; chan++)
1471                                 args.derivs[param * num_dst_deriv_channels + chan] =
1472                                         lp_build_emit_fetch(bld_base, inst, param+1, chan);
1473
1474                         /* Fill in the rest with zeros. */
1475                         for (chan = num_src_deriv_channels;
1476                              chan < num_dst_deriv_channels; chan++)
1477                                 args.derivs[param * num_dst_deriv_channels + chan] =
1478                                         ctx->ac.f32_0;
1479                 }
1480         }
1481
1482         if (target == TGSI_TEXTURE_CUBE ||
1483             target == TGSI_TEXTURE_CUBE_ARRAY ||
1484             target == TGSI_TEXTURE_SHADOWCUBE ||
1485             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1486                 ac_prepare_cube_coords(&ctx->ac,
1487                                        opcode == TGSI_OPCODE_TXD,
1488                                        target == TGSI_TEXTURE_CUBE_ARRAY ||
1489                                        target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
1490                                        opcode == TGSI_OPCODE_LODQ,
1491                                        args.coords, args.derivs);
1492         } else if (tgsi_is_array_sampler(target) &&
1493                    opcode != TGSI_OPCODE_TXF &&
1494                    opcode != TGSI_OPCODE_TXF_LZ &&
1495                    ctx->screen->info.chip_class <= GFX8) {
1496                 unsigned array_coord = target == TGSI_TEXTURE_1D_ARRAY ? 1 : 2;
1497                 args.coords[array_coord] = ac_build_round(&ctx->ac, args.coords[array_coord]);
1498         }
1499
1500         /* 1D textures are allocated and used as 2D on GFX9. */
1501         if (ctx->screen->info.chip_class == GFX9) {
1502                 LLVMValueRef filler;
1503
1504                 /* Use 0.5, so that we don't sample the border color. */
1505                 if (opcode == TGSI_OPCODE_TXF ||
1506                     opcode == TGSI_OPCODE_TXF_LZ)
1507                         filler = ctx->i32_0;
1508                 else
1509                         filler = LLVMConstReal(ctx->f32, 0.5);
1510
1511                 if (target == TGSI_TEXTURE_1D ||
1512                     target == TGSI_TEXTURE_SHADOW1D) {
1513                         args.coords[1] = filler;
1514                 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
1515                            target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
1516                         args.coords[2] = args.coords[1];
1517                         args.coords[1] = filler;
1518                 }
1519         }
1520
1521         /* Pack LOD or sample index */
1522         if (opcode == TGSI_OPCODE_TXL)
1523                 args.lod = args.coords[3];
1524         else if (opcode == TGSI_OPCODE_TXL2)
1525                 args.lod = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1526         else if (opcode == TGSI_OPCODE_TXF) {
1527                 if (target == TGSI_TEXTURE_2D_MSAA) {
1528                         /* No LOD, but move sample index into the right place. */
1529                         args.coords[2] = args.coords[3];
1530                 } else if (target != TGSI_TEXTURE_2D_ARRAY_MSAA) {
1531                         args.lod = args.coords[3];
1532                 }
1533         }
1534
1535         if ((target == TGSI_TEXTURE_2D_MSAA ||
1536              target == TGSI_TEXTURE_2D_ARRAY_MSAA) &&
1537             !(ctx->screen->debug_flags & DBG(NO_FMASK))) {
1538                 ac_apply_fmask_to_sample(&ctx->ac, fmask_ptr, args.coords,
1539                                          target == TGSI_TEXTURE_2D_ARRAY_MSAA);
1540         }
1541
1542         if (opcode == TGSI_OPCODE_TXF ||
1543             opcode == TGSI_OPCODE_TXF_LZ) {
1544                 /* add tex offsets */
1545                 if (inst->Texture.NumOffsets) {
1546                         const struct tgsi_texture_offset *off = inst->TexOffsets;
1547
1548                         assert(inst->Texture.NumOffsets == 1);
1549
1550                         switch (target) {
1551                         case TGSI_TEXTURE_3D:
1552                                 args.coords[2] =
1553                                         LLVMBuildAdd(ctx->ac.builder, args.coords[2],
1554                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ], "");
1555                                 /* fall through */
1556                         case TGSI_TEXTURE_2D:
1557                         case TGSI_TEXTURE_SHADOW2D:
1558                         case TGSI_TEXTURE_RECT:
1559                         case TGSI_TEXTURE_SHADOWRECT:
1560                         case TGSI_TEXTURE_2D_ARRAY:
1561                         case TGSI_TEXTURE_SHADOW2D_ARRAY:
1562                                 args.coords[1] =
1563                                         LLVMBuildAdd(ctx->ac.builder, args.coords[1],
1564                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY], "");
1565                                 /* fall through */
1566                         case TGSI_TEXTURE_1D:
1567                         case TGSI_TEXTURE_SHADOW1D:
1568                         case TGSI_TEXTURE_1D_ARRAY:
1569                         case TGSI_TEXTURE_SHADOW1D_ARRAY:
1570                                 args.coords[0] =
1571                                         LLVMBuildAdd(ctx->ac.builder, args.coords[0],
1572                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX], "");
1573                                 break;
1574                                 /* texture offsets do not apply to other texture targets */
1575                         }
1576                 }
1577         }
1578
1579         if (opcode == TGSI_OPCODE_TG4) {
1580                 unsigned gather_comp = 0;
1581
1582                 /* DMASK was repurposed for GATHER4. 4 components are always
1583                  * returned and DMASK works like a swizzle - it selects
1584                  * the component to fetch. The only valid DMASK values are
1585                  * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
1586                  * (red,red,red,red) etc.) The ISA document doesn't mention
1587                  * this.
1588                  */
1589
1590                 /* Get the component index from src1.x for Gather4. */
1591                 if (!tgsi_is_shadow_target(target)) {
1592                         LLVMValueRef comp_imm;
1593                         struct tgsi_src_register src1 = inst->Src[1].Register;
1594
1595                         assert(src1.File == TGSI_FILE_IMMEDIATE);
1596
1597                         comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
1598                         gather_comp = LLVMConstIntGetZExtValue(comp_imm);
1599                         gather_comp = CLAMP(gather_comp, 0, 3);
1600                 }
1601
1602                 args.dmask = 1 << gather_comp;
1603         } else {
1604                 args.dmask = 0xf;
1605         }
1606
1607         args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1608         args.unorm = target == TGSI_TEXTURE_RECT ||
1609                      target == TGSI_TEXTURE_SHADOWRECT;
1610         args.opcode = ac_image_sample;
1611
1612         switch (opcode) {
1613         case TGSI_OPCODE_TXF:
1614         case TGSI_OPCODE_TXF_LZ:
1615                 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
1616                               target == TGSI_TEXTURE_2D_MSAA ||
1617                               target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
1618                                       ac_image_load : ac_image_load_mip;
1619                 break;
1620         case TGSI_OPCODE_LODQ:
1621                 args.opcode = ac_image_get_lod;
1622                 break;
1623         case TGSI_OPCODE_TEX:
1624         case TGSI_OPCODE_TEX2:
1625         case TGSI_OPCODE_TXP:
1626                 if (ctx->type != PIPE_SHADER_FRAGMENT)
1627                         args.level_zero = true;
1628                 break;
1629         case TGSI_OPCODE_TEX_LZ:
1630                 args.level_zero = true;
1631                 break;
1632         case TGSI_OPCODE_TXB:
1633         case TGSI_OPCODE_TXB2:
1634                 assert(ctx->type == PIPE_SHADER_FRAGMENT);
1635                 break;
1636         case TGSI_OPCODE_TXL:
1637         case TGSI_OPCODE_TXL2:
1638                 break;
1639         case TGSI_OPCODE_TXD:
1640                 break;
1641         case TGSI_OPCODE_TG4:
1642                 args.opcode = ac_image_gather4;
1643                 args.level_zero = true;
1644                 break;
1645         default:
1646                 assert(0);
1647                 return;
1648         }
1649
1650         /* The hardware needs special lowering for Gather4 with integer formats. */
1651         LLVMValueRef gather4_int_result_workaround = NULL;
1652
1653         if (ctx->screen->info.chip_class <= GFX8 &&
1654             opcode == TGSI_OPCODE_TG4) {
1655                 assert(inst->Texture.ReturnType != TGSI_RETURN_TYPE_UNKNOWN);
1656
1657                 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
1658                     inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
1659                         gather4_int_result_workaround =
1660                                 si_lower_gather4_integer(ctx, &args, target,
1661                                                          inst->Texture.ReturnType);
1662                 }
1663         }
1664
1665         args.attributes = AC_FUNC_ATTR_READNONE;
1666         LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
1667
1668         if (gather4_int_result_workaround) {
1669                 result = si_fix_gather4_integer_result(ctx, result,
1670                                                        inst->Texture.ReturnType,
1671                                                        gather4_int_result_workaround);
1672         }
1673
1674         emit_data->output[emit_data->chan] = result;
1675 }
1676
1677 static void si_llvm_emit_txqs(
1678         const struct lp_build_tgsi_action *action,
1679         struct lp_build_tgsi_context *bld_base,
1680         struct lp_build_emit_data *emit_data)
1681 {
1682         struct si_shader_context *ctx = si_shader_context(bld_base);
1683         LLVMValueRef res, samples;
1684         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
1685
1686         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
1687
1688         /* Read the samples from the descriptor directly. */
1689         res = LLVMBuildBitCast(ctx->ac.builder, res_ptr, ctx->v8i32, "");
1690         samples = LLVMBuildExtractElement(ctx->ac.builder, res,
1691                                           LLVMConstInt(ctx->i32, 3, 0), "");
1692         samples = LLVMBuildLShr(ctx->ac.builder, samples,
1693                                 LLVMConstInt(ctx->i32, 16, 0), "");
1694         samples = LLVMBuildAnd(ctx->ac.builder, samples,
1695                                LLVMConstInt(ctx->i32, 0xf, 0), "");
1696         samples = LLVMBuildShl(ctx->ac.builder, ctx->i32_1,
1697                                samples, "");
1698
1699         emit_data->output[emit_data->chan] = samples;
1700 }
1701
1702 static void si_llvm_emit_fbfetch(const struct lp_build_tgsi_action *action,
1703                                  struct lp_build_tgsi_context *bld_base,
1704                                  struct lp_build_emit_data *emit_data)
1705 {
1706         struct si_shader_context *ctx = si_shader_context(bld_base);
1707         struct ac_image_args args = {};
1708         LLVMValueRef ptr, image, fmask;
1709
1710         /* Ignore src0, because KHR_blend_func_extended disallows multiple render
1711          * targets.
1712          */
1713
1714         /* Load the image descriptor. */
1715         STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
1716         ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1717         ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr,
1718                                    ac_array_in_const32_addr_space(ctx->v8i32), "");
1719         image = ac_build_load_to_sgpr(&ctx->ac, ptr,
1720                         LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
1721
1722         unsigned chan = 0;
1723
1724         args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 0, 16);
1725
1726         if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D)
1727                 args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 16, 16);
1728
1729         /* Get the current render target layer index. */
1730         if (ctx->shader->key.mono.u.ps.fbfetch_layered)
1731                 args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_ANCILLARY, 16, 11);
1732
1733         if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
1734                 args.coords[chan++] = si_get_sample_id(ctx);
1735
1736         if (ctx->shader->key.mono.u.ps.fbfetch_msaa &&
1737             !(ctx->screen->debug_flags & DBG(NO_FMASK))) {
1738                 fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
1739                         LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
1740
1741                 ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
1742                                          ctx->shader->key.mono.u.ps.fbfetch_layered);
1743         }
1744
1745         args.opcode = ac_image_load;
1746         args.resource = image;
1747         args.dmask = 0xf;
1748         args.attributes = AC_FUNC_ATTR_READNONE;
1749
1750         if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
1751                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1752                         ac_image_2darraymsaa : ac_image_2dmsaa;
1753         else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D)
1754                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1755                         ac_image_1darray : ac_image_1d;
1756         else
1757                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1758                         ac_image_2darray : ac_image_2d;
1759
1760         emit_data->output[emit_data->chan] =
1761                 ac_build_image_opcode(&ctx->ac, &args);
1762 }
1763
1764 /**
1765  * Setup actions for TGSI memory opcode, including texture opcodes.
1766  */
1767 void si_shader_context_init_mem(struct si_shader_context *ctx)
1768 {
1769         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1770
1771         bld_base->op_actions[TGSI_OPCODE_TEX].emit = build_tex_intrinsic;
1772         bld_base->op_actions[TGSI_OPCODE_TEX_LZ].emit = build_tex_intrinsic;
1773         bld_base->op_actions[TGSI_OPCODE_TEX2].emit = build_tex_intrinsic;
1774         bld_base->op_actions[TGSI_OPCODE_TXB].emit = build_tex_intrinsic;
1775         bld_base->op_actions[TGSI_OPCODE_TXB2].emit = build_tex_intrinsic;
1776         bld_base->op_actions[TGSI_OPCODE_TXD].emit = build_tex_intrinsic;
1777         bld_base->op_actions[TGSI_OPCODE_TXF].emit = build_tex_intrinsic;
1778         bld_base->op_actions[TGSI_OPCODE_TXF_LZ].emit = build_tex_intrinsic;
1779         bld_base->op_actions[TGSI_OPCODE_TXL].emit = build_tex_intrinsic;
1780         bld_base->op_actions[TGSI_OPCODE_TXL2].emit = build_tex_intrinsic;
1781         bld_base->op_actions[TGSI_OPCODE_TXP].emit = build_tex_intrinsic;
1782         bld_base->op_actions[TGSI_OPCODE_TXQ].emit = resq_emit;
1783         bld_base->op_actions[TGSI_OPCODE_TG4].emit = build_tex_intrinsic;
1784         bld_base->op_actions[TGSI_OPCODE_LODQ].emit = build_tex_intrinsic;
1785         bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
1786
1787         bld_base->op_actions[TGSI_OPCODE_FBFETCH].emit = si_llvm_emit_fbfetch;
1788
1789         bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
1790         bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
1791         bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
1792
1793         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].emit = atomic_emit;
1794         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
1795         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].emit = atomic_emit;
1796         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
1797         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].emit = atomic_emit;
1798         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
1799         bld_base->op_actions[TGSI_OPCODE_ATOMAND].emit = atomic_emit;
1800         bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
1801         bld_base->op_actions[TGSI_OPCODE_ATOMOR].emit = atomic_emit;
1802         bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
1803         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].emit = atomic_emit;
1804         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
1805         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].emit = atomic_emit;
1806         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
1807         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].emit = atomic_emit;
1808         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
1809         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].emit = atomic_emit;
1810         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
1811         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].emit = atomic_emit;
1812         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
1813 }