src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c

   1 /*
   2  * Copyright 2017 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include "si_shader_internal.h"
  26 #include "si_pipe.h"
  27 #include "sid.h"
  28 #include "tgsi/tgsi_build.h"
  29 #include "tgsi/tgsi_util.h"
  30 #include "ac_llvm_util.h"
  31
  32 static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
  33                            struct lp_build_emit_data *emit_data,
  34                            LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
  35                            LLVMValueRef *fmask_ptr);
  36
  37 /**
  38  * Given a v8i32 resource descriptor for a buffer, extract the size of the
  39  * buffer in number of elements and return it as an i32.
  40  */
  41 static LLVMValueRef get_buffer_size(
  42         struct lp_build_tgsi_context *bld_base,
  43         LLVMValueRef descriptor)
  44 {
  45         struct si_shader_context *ctx = si_shader_context(bld_base);
  46         LLVMBuilderRef builder = ctx->ac.builder;
  47         LLVMValueRef size =
  48                 LLVMBuildExtractElement(builder, descriptor,
  49                                         LLVMConstInt(ctx->i32, 2, 0), "");
  50
  51         if (ctx->screen->info.chip_class == VI) {
  52                 /* On VI, the descriptor contains the size in bytes,
  53                  * but TXQ must return the size in elements.
  54                  * The stride is always non-zero for resources using TXQ.
  55                  */
  56                 LLVMValueRef stride =
  57                         LLVMBuildExtractElement(builder, descriptor,
  58                                                 ctx->i32_1, "");
  59                 stride = LLVMBuildLShr(builder, stride,
  60                                        LLVMConstInt(ctx->i32, 16, 0), "");
  61                 stride = LLVMBuildAnd(builder, stride,
  62                                       LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
  63
  64                 size = LLVMBuildUDiv(builder, size, stride, "");
  65         }
  66
  67         return size;
  68 }
  69
  70 static LLVMValueRef
  71 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
  72                          const struct tgsi_full_src_register *reg,
  73                          bool ubo)
  74 {
  75         LLVMValueRef index;
  76
  77         if (!reg->Register.Indirect) {
  78                 index = LLVMConstInt(ctx->i32, reg->Register.Index, false);
  79         } else {
  80                 index = si_get_indirect_index(ctx, &reg->Indirect,
  81                                               1, reg->Register.Index);
  82         }
  83
  84         if (ubo)
  85                 return ctx->abi.load_ubo(&ctx->abi, index);
  86         else
  87                 return ctx->abi.load_ssbo(&ctx->abi, index, false);
  88 }
  89
  90 static enum ac_image_dim
  91 ac_texture_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target)
  92 {
  93         switch (target) {
  94         case TGSI_TEXTURE_1D:
  95         case TGSI_TEXTURE_SHADOW1D:
  96                 if (screen->info.chip_class >= GFX9)
  97                         return ac_image_2d;
  98                 return ac_image_1d;
  99         case TGSI_TEXTURE_2D:
 100         case TGSI_TEXTURE_SHADOW2D:
 101         case TGSI_TEXTURE_RECT:
 102         case TGSI_TEXTURE_SHADOWRECT:
 103                 return ac_image_2d;
 104         case TGSI_TEXTURE_3D:
 105                 return ac_image_3d;
 106         case TGSI_TEXTURE_CUBE:
 107         case TGSI_TEXTURE_SHADOWCUBE:
 108         case TGSI_TEXTURE_CUBE_ARRAY:
 109         case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
 110                 return ac_image_cube;
 111         case TGSI_TEXTURE_1D_ARRAY:
 112         case TGSI_TEXTURE_SHADOW1D_ARRAY:
 113                 if (screen->info.chip_class >= GFX9)
 114                         return ac_image_2darray;
 115                 return ac_image_1darray;
 116         case TGSI_TEXTURE_2D_ARRAY:
 117         case TGSI_TEXTURE_SHADOW2D_ARRAY:
 118                 return ac_image_2darray;
 119         case TGSI_TEXTURE_2D_MSAA:
 120                 return ac_image_2dmsaa;
 121         case TGSI_TEXTURE_2D_ARRAY_MSAA:
 122                 return ac_image_2darraymsaa;
 123         default:
 124                 unreachable("unhandled texture type");
 125         }
 126 }
 127
 128 static enum ac_image_dim
 129 ac_image_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target)
 130 {
 131         enum ac_image_dim dim = ac_texture_dim_from_tgsi_target(screen, target);
 132
 133         /* Match the resource type set in the descriptor. */
 134         if (dim == ac_image_cube ||
 135             (screen->info.chip_class <= VI && dim == ac_image_3d))
 136                 dim = ac_image_2darray;
 137         else if (target == TGSI_TEXTURE_2D && screen->info.chip_class >= GFX9) {
 138                 /* When a single layer of a 3D texture is bound, the shader
 139                  * will refer to a 2D target, but the descriptor has a 3D type.
 140                  * Since the HW ignores BASE_ARRAY in this case, we need to
 141                  * send 3 coordinates. This doesn't hurt when the underlying
 142                  * texture is non-3D.
 143                  */
 144                 dim = ac_image_3d;
 145         }
 146
 147         return dim;
 148 }
 149
 150 /**
 151  * Given a 256-bit resource descriptor, force the DCC enable bit to off.
 152  *
 153  * At least on Tonga, executing image stores on images with DCC enabled and
 154  * non-trivial can eventually lead to lockups. This can occur when an
 155  * application binds an image as read-only but then uses a shader that writes
 156  * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
 157  * program termination) in this case, but it doesn't cost much to be a bit
 158  * nicer: disabling DCC in the shader still leads to undefined results but
 159  * avoids the lockup.
 160  */
 161 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
 162                                   LLVMValueRef rsrc)
 163 {
 164         if (ctx->screen->info.chip_class <= CIK) {
 165                 return rsrc;
 166         } else {
 167                 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
 168                 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
 169                 LLVMValueRef tmp;
 170
 171                 tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
 172                 tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
 173                 return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
 174         }
 175 }
 176
 177 LLVMValueRef si_load_image_desc(struct si_shader_context *ctx,
 178                                 LLVMValueRef list, LLVMValueRef index,
 179                                 enum ac_descriptor_type desc_type, bool dcc_off,
 180                                 bool bindless)
 181 {
 182         LLVMBuilderRef builder = ctx->ac.builder;
 183         LLVMValueRef rsrc;
 184
 185         if (desc_type == AC_DESC_BUFFER) {
 186                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0),
 187                                       ctx->i32_1);
 188                 list = LLVMBuildPointerCast(builder, list,
 189                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
 190         } else {
 191                 assert(desc_type == AC_DESC_IMAGE);
 192         }
 193
 194         if (bindless)
 195                 rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
 196         else
 197                 rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
 198
 199         if (desc_type == AC_DESC_IMAGE && dcc_off)
 200                 rsrc = force_dcc_off(ctx, rsrc);
 201         return rsrc;
 202 }
 203
 204 /**
 205  * Load the resource descriptor for \p image.
 206  */
 207 static void
 208 image_fetch_rsrc(
 209         struct lp_build_tgsi_context *bld_base,
 210         const struct tgsi_full_src_register *image,
 211         bool is_store, unsigned target,
 212         LLVMValueRef *rsrc)
 213 {
 214         struct si_shader_context *ctx = si_shader_context(bld_base);
 215         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
 216                                              ctx->param_samplers_and_images);
 217         LLVMValueRef index;
 218         bool dcc_off = is_store;
 219
 220         if (!image->Register.Indirect) {
 221                 const struct tgsi_shader_info *info = bld_base->info;
 222                 unsigned images_writemask = info->images_store |
 223                                             info->images_atomic;
 224
 225                 index = LLVMConstInt(ctx->i32,
 226                                      si_get_image_slot(image->Register.Index), 0);
 227
 228                 if (images_writemask & (1 << image->Register.Index))
 229                         dcc_off = true;
 230         } else {
 231                 /* From the GL_ARB_shader_image_load_store extension spec:
 232                  *
 233                  *    If a shader performs an image load, store, or atomic
 234                  *    operation using an image variable declared as an array,
 235                  *    and if the index used to select an individual element is
 236                  *    negative or greater than or equal to the size of the
 237                  *    array, the results of the operation are undefined but may
 238                  *    not lead to termination.
 239                  */
 240                 index = si_get_bounded_indirect_index(ctx, &image->Indirect,
 241                                                       image->Register.Index,
 242                                                       ctx->num_images);
 243                 index = LLVMBuildSub(ctx->ac.builder,
 244                                      LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0),
 245                                      index, "");
 246         }
 247
 248         bool bindless = false;
 249
 250         if (image->Register.File != TGSI_FILE_IMAGE) {
 251                 /* Bindless descriptors are accessible from a different pair of
 252                  * user SGPR indices.
 253                  */
 254                 rsrc_ptr = LLVMGetParam(ctx->main_fn,
 255                                         ctx->param_bindless_samplers_and_images);
 256                 index = lp_build_emit_fetch_src(bld_base, image,
 257                                                 TGSI_TYPE_UNSIGNED, 0);
 258
 259                 /* For simplicity, bindless image descriptors use fixed
 260                  * 16-dword slots for now.
 261                  */
 262                 index = LLVMBuildMul(ctx->ac.builder, index,
 263                                      LLVMConstInt(ctx->i32, 2, 0), "");
 264                 bindless = true;
 265         }
 266
 267         *rsrc = si_load_image_desc(ctx, rsrc_ptr, index,
 268                                    target == TGSI_TEXTURE_BUFFER ? AC_DESC_BUFFER : AC_DESC_IMAGE,
 269                                    dcc_off, bindless);
 270 }
 271
 272 static void image_fetch_coords(
 273                 struct lp_build_tgsi_context *bld_base,
 274                 const struct tgsi_full_instruction *inst,
 275                 unsigned src, LLVMValueRef desc,
 276                 LLVMValueRef *coords)
 277 {
 278         struct si_shader_context *ctx = si_shader_context(bld_base);
 279         LLVMBuilderRef builder = ctx->ac.builder;
 280         unsigned target = inst->Memory.Texture;
 281         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
 282         LLVMValueRef tmp;
 283         int chan;
 284
 285         if (target == TGSI_TEXTURE_2D_MSAA ||
 286             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
 287                 /* Need the sample index as well. */
 288                 num_coords++;
 289         }
 290
 291         for (chan = 0; chan < num_coords; ++chan) {
 292                 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
 293                 tmp = ac_to_integer(&ctx->ac, tmp);
 294                 coords[chan] = tmp;
 295         }
 296
 297         if (ctx->screen->info.chip_class >= GFX9) {
 298                 /* 1D textures are allocated and used as 2D on GFX9. */
 299                 if (target == TGSI_TEXTURE_1D) {
 300                         coords[1] = ctx->i32_0;
 301                 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
 302                         coords[2] = coords[1];
 303                         coords[1] = ctx->i32_0;
 304                 } else if (target == TGSI_TEXTURE_2D) {
 305                         /* The hw can't bind a slice of a 3D image as a 2D
 306                          * image, because it ignores BASE_ARRAY if the target
 307                          * is 3D. The workaround is to read BASE_ARRAY and set
 308                          * it as the 3rd address operand for all 2D images.
 309                          */
 310                         LLVMValueRef first_layer, const5, mask;
 311
 312                         const5 = LLVMConstInt(ctx->i32, 5, 0);
 313                         mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
 314                         first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
 315                         first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
 316
 317                         coords[2] = first_layer;
 318                 }
 319         }
 320 }
 321
 322 static unsigned get_cache_policy(struct si_shader_context *ctx,
 323                                  const struct tgsi_full_instruction *inst,
 324                                  bool atomic, bool may_store_unaligned,
 325                                  bool writeonly_memory)
 326 {
 327         unsigned cache_policy = 0;
 328
 329         if (!atomic &&
 330             /* SI has a TC L1 bug causing corruption of 8bit/16bit stores.
 331              * All store opcodes not aligned to a dword are affected.
 332              * The only way to get unaligned stores in radeonsi is through
 333              * shader images. */
 334             ((may_store_unaligned && ctx->screen->info.chip_class == SI) ||
 335              /* If this is write-only, don't keep data in L1 to prevent
 336               * evicting L1 cache lines that may be needed by other
 337               * instructions. */
 338              writeonly_memory ||
 339              inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE)))
 340                 cache_policy |= ac_glc;
 341
 342         if (inst->Memory.Qualifier & TGSI_MEMORY_STREAM_CACHE_POLICY)
 343                 cache_policy |= ac_slc;
 344
 345         return cache_policy;
 346 }
 347
 348 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
 349                                    const struct tgsi_full_instruction *inst,
 350                                    LLVMTypeRef type, int arg)
 351 {
 352         LLVMBuilderRef builder = ctx->ac.builder;
 353         LLVMValueRef offset, ptr;
 354         int addr_space;
 355
 356         offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
 357         offset = ac_to_integer(&ctx->ac, offset);
 358
 359         ptr = ctx->ac.lds;
 360         ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
 361         addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
 362         ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
 363
 364         return ptr;
 365 }
 366
 367 static void load_emit_memory(
 368                 struct si_shader_context *ctx,
 369                 struct lp_build_emit_data *emit_data)
 370 {
 371         const struct tgsi_full_instruction *inst = emit_data->inst;
 372         unsigned writemask = inst->Dst[0].Register.WriteMask;
 373         LLVMValueRef channels[4], ptr, derived_ptr, index;
 374         int chan;
 375
 376         ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
 377
 378         for (chan = 0; chan < 4; ++chan) {
 379                 if (!(writemask & (1 << chan))) {
 380                         channels[chan] = LLVMGetUndef(ctx->f32);
 381                         continue;
 382                 }
 383
 384                 index = LLVMConstInt(ctx->i32, chan, 0);
 385                 derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, "");
 386                 channels[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, "");
 387         }
 388         emit_data->output[emit_data->chan] = ac_build_gather_values(&ctx->ac, channels, 4);
 389 }
 390
 391 /**
 392  * Return true if the memory accessed by a LOAD or STORE instruction is
 393  * read-only or write-only, respectively.
 394  *
 395  * \param shader_buffers_reverse_access_mask
 396  *      For LOAD, set this to (store | atomic) slot usage in the shader.
 397  *      For STORE, set this to (load | atomic) slot usage in the shader.
 398  * \param images_reverse_access_mask  Same as above, but for images.
 399  * \param bindless_buffer_reverse_access_mask  Same as above, but for bindless image buffers.
 400  * \param bindless_image_reverse_access_mask   Same as above, but for bindless images.
 401  */
 402 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
 403                                   const struct tgsi_shader_info *info,
 404                                   unsigned shader_buffers_reverse_access_mask,
 405                                   unsigned images_reverse_access_mask,
 406                                   bool bindless_buffer_reverse_access_mask,
 407                                   bool bindless_image_reverse_access_mask)
 408 {
 409         enum tgsi_file_type resource_file;
 410         unsigned resource_index;
 411         bool resource_indirect;
 412
 413         if (inst->Instruction.Opcode == TGSI_OPCODE_STORE) {
 414                 resource_file = inst->Dst[0].Register.File;
 415                 resource_index = inst->Dst[0].Register.Index;
 416                 resource_indirect = inst->Dst[0].Register.Indirect;
 417         } else {
 418                 resource_file = inst->Src[0].Register.File;
 419                 resource_index = inst->Src[0].Register.Index;
 420                 resource_indirect = inst->Src[0].Register.Indirect;
 421         }
 422
 423         assert(resource_file == TGSI_FILE_BUFFER ||
 424                resource_file == TGSI_FILE_IMAGE ||
 425                /* bindless image */
 426                resource_file == TGSI_FILE_INPUT ||
 427                resource_file == TGSI_FILE_OUTPUT ||
 428                resource_file == TGSI_FILE_CONSTANT ||
 429                resource_file == TGSI_FILE_TEMPORARY ||
 430                resource_file == TGSI_FILE_IMMEDIATE);
 431
 432         assert(resource_file != TGSI_FILE_BUFFER ||
 433                inst->Memory.Texture == TGSI_TEXTURE_BUFFER);
 434
 435         bool bindless = resource_file != TGSI_FILE_BUFFER &&
 436                         resource_file != TGSI_FILE_IMAGE;
 437
 438         /* RESTRICT means NOALIAS.
 439          * If there are no writes, we can assume the accessed memory is read-only.
 440          * If there are no reads, we can assume the accessed memory is write-only.
 441          */
 442         if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT && !bindless) {
 443                 unsigned reverse_access_mask;
 444
 445                 if (resource_file == TGSI_FILE_BUFFER) {
 446                         reverse_access_mask = shader_buffers_reverse_access_mask;
 447                 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 448                         reverse_access_mask = info->images_buffers &
 449                                               images_reverse_access_mask;
 450                 } else {
 451                         reverse_access_mask = ~info->images_buffers &
 452                                               images_reverse_access_mask;
 453                 }
 454
 455                 if (resource_indirect) {
 456                         if (!reverse_access_mask)
 457                                 return true;
 458                 } else {
 459                         if (!(reverse_access_mask &
 460                               (1u << resource_index)))
 461                                 return true;
 462                 }
 463         }
 464
 465         /* If there are no buffer writes (for both shader buffers & image
 466          * buffers), it implies that buffer memory is read-only.
 467          * If there are no buffer reads (for both shader buffers & image
 468          * buffers), it implies that buffer memory is write-only.
 469          *
 470          * Same for the case when there are no writes/reads for non-buffer
 471          * images.
 472          */
 473         if (resource_file == TGSI_FILE_BUFFER ||
 474             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 475                 if (!shader_buffers_reverse_access_mask &&
 476                     !(info->images_buffers & images_reverse_access_mask) &&
 477                     !bindless_buffer_reverse_access_mask)
 478                         return true;
 479         } else {
 480                 if (!(~info->images_buffers & images_reverse_access_mask) &&
 481                     !bindless_image_reverse_access_mask)
 482                         return true;
 483         }
 484         return false;
 485 }
 486
 487 static void load_emit(
 488                 const struct lp_build_tgsi_action *action,
 489                 struct lp_build_tgsi_context *bld_base,
 490                 struct lp_build_emit_data *emit_data)
 491 {
 492         struct si_shader_context *ctx = si_shader_context(bld_base);
 493         const struct tgsi_full_instruction * inst = emit_data->inst;
 494         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 495         bool can_speculate = false;
 496         LLVMValueRef vindex = ctx->i32_0;
 497         LLVMValueRef voffset = ctx->i32_0;
 498         struct ac_image_args args = {};
 499
 500         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
 501                 load_emit_memory(ctx, emit_data);
 502                 return;
 503         }
 504
 505         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
 506             inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) {
 507                 bool ubo = inst->Src[0].Register.File == TGSI_FILE_CONSTBUF;
 508                 args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], ubo);
 509                 voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0));
 510         } else {
 511                 unsigned target = inst->Memory.Texture;
 512
 513                 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &args.resource);
 514                 image_fetch_coords(bld_base, inst, 1, args.resource, args.coords);
 515                 vindex = args.coords[0]; /* for buffers only */
 516         }
 517
 518         if (inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) {
 519                 emit_data->output[emit_data->chan] =
 520                         ac_build_buffer_load(&ctx->ac, args.resource,
 521                                              util_last_bit(inst->Dst[0].Register.WriteMask),
 522                                              NULL, voffset, NULL, 0, 0, 0, true, true);
 523                 return;
 524         }
 525
 526         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
 527                 ac_build_waitcnt(&ctx->ac, VM_CNT);
 528
 529         can_speculate = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
 530                           is_oneway_access_only(inst, info,
 531                                                 info->shader_buffers_store |
 532                                                 info->shader_buffers_atomic,
 533                                                 info->images_store |
 534                                                 info->images_atomic,
 535                                                 info->uses_bindless_buffer_store |
 536                                                 info->uses_bindless_buffer_atomic,
 537                                                 info->uses_bindless_image_store |
 538                                                 info->uses_bindless_image_atomic);
 539         args.cache_policy = get_cache_policy(ctx, inst, false, false, false);
 540
 541         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 542                 /* Don't use SMEM for shader buffer loads, because LLVM doesn't
 543                  * select SMEM for SI.load.const with a non-constant offset, and
 544                  * constant offsets practically don't exist with shader buffers.
 545                  *
 546                  * Also, SI.load.const doesn't use inst_offset when it's lowered
 547                  * to VMEM, so we just end up with more VALU instructions in the end
 548                  * and no benefit.
 549                  *
 550                  * TODO: Remove this line once LLVM can select SMEM with a non-constant
 551                  *       offset, and can derive inst_offset when VMEM is selected.
 552                  *       After that, si_memory_barrier should invalidate sL1 for shader
 553                  *       buffers.
 554                  */
 555                 emit_data->output[emit_data->chan] =
 556                         ac_build_buffer_load(&ctx->ac, args.resource,
 557                                              util_last_bit(inst->Dst[0].Register.WriteMask),
 558                                              NULL, voffset, NULL, 0,
 559                                              !!(args.cache_policy & ac_glc),
 560                                              !!(args.cache_policy & ac_slc),
 561                                              can_speculate, false);
 562                 return;
 563         }
 564
 565         if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 566                 unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask);
 567                 LLVMValueRef result =
 568                         ac_build_buffer_load_format(&ctx->ac,
 569                                                     args.resource,
 570                                                     vindex,
 571                                                     ctx->i32_0,
 572                                                     num_channels,
 573                                                     !!(args.cache_policy & ac_glc),
 574                                                     can_speculate);
 575                 emit_data->output[emit_data->chan] =
 576                         ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
 577         } else {
 578                 args.opcode = ac_image_load;
 579                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 580                 args.attributes = ac_get_load_intr_attribs(can_speculate);
 581                 args.dmask = 0xf;
 582
 583                 emit_data->output[emit_data->chan] =
 584                         ac_build_image_opcode(&ctx->ac, &args);
 585         }
 586 }
 587
 588 static void store_emit_buffer(struct si_shader_context *ctx,
 589                               LLVMValueRef resource,
 590                               unsigned writemask,
 591                               LLVMValueRef value,
 592                               LLVMValueRef voffset,
 593                               unsigned cache_policy,
 594                               bool writeonly_memory)
 595 {
 596         LLVMBuilderRef builder = ctx->ac.builder;
 597         LLVMValueRef base_data = value;
 598         LLVMValueRef base_offset = voffset;
 599
 600         while (writemask) {
 601                 int start, count;
 602                 const char *intrinsic_name;
 603                 LLVMValueRef data, voff;
 604
 605                 u_bit_scan_consecutive_range(&writemask, &start, &count);
 606
 607                 /* Due to an LLVM limitation, split 3-element writes
 608                  * into a 2-element and a 1-element write. */
 609                 if (count == 3) {
 610                         writemask |= 1 << (start + 2);
 611                         count = 2;
 612                 }
 613
 614                 if (count == 4) {
 615                         data = base_data;
 616                         intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
 617                 } else if (count == 2) {
 618                         LLVMValueRef values[2] = {
 619                                 LLVMBuildExtractElement(builder, base_data,
 620                                                         LLVMConstInt(ctx->i32, start, 0), ""),
 621                                 LLVMBuildExtractElement(builder, base_data,
 622                                                         LLVMConstInt(ctx->i32, start + 1, 0), ""),
 623                         };
 624
 625                         data = ac_build_gather_values(&ctx->ac, values, 2);
 626                         intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
 627                 } else {
 628                         assert(count == 1);
 629                         data = LLVMBuildExtractElement(
 630                                 builder, base_data,
 631                                 LLVMConstInt(ctx->i32, start, 0), "");
 632                         intrinsic_name = "llvm.amdgcn.buffer.store.f32";
 633                 }
 634
 635                 voff = base_offset;
 636                 if (start != 0) {
 637                         voff = LLVMBuildAdd(
 638                                 builder, voff,
 639                                 LLVMConstInt(ctx->i32, start * 4, 0), "");
 640                 }
 641
 642                 LLVMValueRef args[] = {
 643                         data,
 644                         resource,
 645                         ctx->i32_0, /* vindex */
 646                         voff,
 647                         LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
 648                         LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0),
 649                 };
 650                 ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->voidt, args, 6,
 651                                    ac_get_store_intr_attribs(writeonly_memory));
 652         }
 653 }
 654
 655 static void store_emit_memory(
 656                 struct si_shader_context *ctx,
 657                 struct lp_build_emit_data *emit_data)
 658 {
 659         const struct tgsi_full_instruction *inst = emit_data->inst;
 660         LLVMBuilderRef builder = ctx->ac.builder;
 661         unsigned writemask = inst->Dst[0].Register.WriteMask;
 662         LLVMValueRef ptr, derived_ptr, data, index;
 663         int chan;
 664
 665         ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
 666
 667         for (chan = 0; chan < 4; ++chan) {
 668                 if (!(writemask & (1 << chan))) {
 669                         continue;
 670                 }
 671                 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
 672                 index = LLVMConstInt(ctx->i32, chan, 0);
 673                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
 674                 LLVMBuildStore(builder, data, derived_ptr);
 675         }
 676 }
 677
 678 static void store_emit(
 679                 const struct lp_build_tgsi_action *action,
 680                 struct lp_build_tgsi_context *bld_base,
 681                 struct lp_build_emit_data *emit_data)
 682 {
 683         struct si_shader_context *ctx = si_shader_context(bld_base);
 684         const struct tgsi_full_instruction * inst = emit_data->inst;
 685         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 686         struct tgsi_full_src_register resource_reg =
 687                 tgsi_full_src_register_from_dst(&inst->Dst[0]);
 688         unsigned target = inst->Memory.Texture;
 689
 690         if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
 691                 store_emit_memory(ctx, emit_data);
 692                 return;
 693         }
 694
 695         bool writeonly_memory = is_oneway_access_only(inst, info,
 696                                                       info->shader_buffers_load |
 697                                                       info->shader_buffers_atomic,
 698                                                       info->images_load |
 699                                                       info->images_atomic,
 700                                                       info->uses_bindless_buffer_load |
 701                                                       info->uses_bindless_buffer_atomic,
 702                                                       info->uses_bindless_image_load |
 703                                                       info->uses_bindless_image_atomic);
 704         LLVMValueRef chans[4];
 705         LLVMValueRef vindex = ctx->i32_0;
 706         LLVMValueRef voffset = ctx->i32_0;
 707         struct ac_image_args args = {};
 708
 709         for (unsigned chan = 0; chan < 4; ++chan)
 710                 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
 711
 712         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
 713                 args.resource = shader_buffer_fetch_rsrc(ctx, &resource_reg, false);
 714                 voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 0, 0));
 715         } else {
 716                 image_fetch_rsrc(bld_base, &resource_reg, true, target, &args.resource);
 717                 image_fetch_coords(bld_base, inst, 0, args.resource, args.coords);
 718                 vindex = args.coords[0]; /* for buffers only */
 719         }
 720
 721         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
 722                 ac_build_waitcnt(&ctx->ac, VM_CNT);
 723
 724         bool is_image = inst->Dst[0].Register.File != TGSI_FILE_BUFFER;
 725         args.cache_policy = get_cache_policy(ctx, inst,
 726                                              false, /* atomic */
 727                                              is_image, /* may_store_unaligned */
 728                                              writeonly_memory);
 729
 730         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
 731                 store_emit_buffer(ctx, args.resource, inst->Dst[0].Register.WriteMask,
 732                                   ac_build_gather_values(&ctx->ac, chans, 4),
 733                                   voffset, args.cache_policy, writeonly_memory);
 734                 return;
 735         }
 736
 737         if (target == TGSI_TEXTURE_BUFFER) {
 738                 unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask);
 739                 num_channels = util_next_power_of_two(num_channels);
 740
 741                 LLVMValueRef buf_args[6] = {
 742                         ac_build_gather_values(&ctx->ac, chans, 4),
 743                         args.resource,
 744                         vindex,
 745                         ctx->i32_0, /* voffset */
 746                 };
 747
 748                 if (HAVE_LLVM >= 0x0800) {
 749                         buf_args[4] = ctx->i32_0; /* soffset */
 750                         buf_args[5] = LLVMConstInt(ctx->i1, args.cache_policy, 0);
 751                 } else {
 752                         buf_args[4] = LLVMConstInt(ctx->i1, !!(args.cache_policy & ac_glc), 0);
 753                         buf_args[5] = LLVMConstInt(ctx->i1, !!(args.cache_policy & ac_slc), 0);
 754                 }
 755
 756                 const char *types[] = { "f32", "v2f32", "v4f32" };
 757                 char name[128];
 758
 759                 snprintf(name, sizeof(name), "%s.%s",
 760                          HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.struct.buffer.store.format" :
 761                                                "llvm.amdgcn.buffer.store.format",
 762                          types[CLAMP(num_channels, 1, 3) - 1]);
 763
 764                 emit_data->output[emit_data->chan] = ac_build_intrinsic(
 765                         &ctx->ac,
 766                         name,
 767                         ctx->voidt, buf_args, 6,
 768                         ac_get_store_intr_attribs(writeonly_memory));
 769         } else {
 770                 args.opcode = ac_image_store;
 771                 args.data[0] = ac_build_gather_values(&ctx->ac, chans, 4);
 772                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 773                 args.attributes = ac_get_store_intr_attribs(writeonly_memory);
 774                 args.dmask = 0xf;
 775
 776                 emit_data->output[emit_data->chan] =
 777                         ac_build_image_opcode(&ctx->ac, &args);
 778         }
 779 }
 780
 781 static void atomic_emit_memory(struct si_shader_context *ctx,
 782                                struct lp_build_emit_data *emit_data) {
 783         LLVMBuilderRef builder = ctx->ac.builder;
 784         const struct tgsi_full_instruction * inst = emit_data->inst;
 785         LLVMValueRef ptr, result, arg;
 786
 787         ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
 788
 789         arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
 790         arg = ac_to_integer(&ctx->ac, arg);
 791
 792         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 793                 LLVMValueRef new_data;
 794                 new_data = lp_build_emit_fetch(&ctx->bld_base,
 795                                                inst, 3, 0);
 796
 797                 new_data = ac_to_integer(&ctx->ac, new_data);
 798
 799                 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
 800                                        LLVMAtomicOrderingSequentiallyConsistent,
 801                                        LLVMAtomicOrderingSequentiallyConsistent,
 802                                        false);
 803
 804                 result = LLVMBuildExtractValue(builder, result, 0, "");
 805         } else {
 806                 LLVMAtomicRMWBinOp op;
 807
 808                 switch(inst->Instruction.Opcode) {
 809                         case TGSI_OPCODE_ATOMUADD:
 810                                 op = LLVMAtomicRMWBinOpAdd;
 811                                 break;
 812                         case TGSI_OPCODE_ATOMXCHG:
 813                                 op = LLVMAtomicRMWBinOpXchg;
 814                                 break;
 815                         case TGSI_OPCODE_ATOMAND:
 816                                 op = LLVMAtomicRMWBinOpAnd;
 817                                 break;
 818                         case TGSI_OPCODE_ATOMOR:
 819                                 op = LLVMAtomicRMWBinOpOr;
 820                                 break;
 821                         case TGSI_OPCODE_ATOMXOR:
 822                                 op = LLVMAtomicRMWBinOpXor;
 823                                 break;
 824                         case TGSI_OPCODE_ATOMUMIN:
 825                                 op = LLVMAtomicRMWBinOpUMin;
 826                                 break;
 827                         case TGSI_OPCODE_ATOMUMAX:
 828                                 op = LLVMAtomicRMWBinOpUMax;
 829                                 break;
 830                         case TGSI_OPCODE_ATOMIMIN:
 831                                 op = LLVMAtomicRMWBinOpMin;
 832                                 break;
 833                         case TGSI_OPCODE_ATOMIMAX:
 834                                 op = LLVMAtomicRMWBinOpMax;
 835                                 break;
 836                         default:
 837                                 unreachable("unknown atomic opcode");
 838                 }
 839
 840                 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
 841                                        LLVMAtomicOrderingSequentiallyConsistent,
 842                                        false);
 843         }
 844         emit_data->output[emit_data->chan] =
 845                 LLVMBuildBitCast(builder, result, ctx->f32, "");
 846 }
 847
 848 static void atomic_emit(
 849                 const struct lp_build_tgsi_action *action,
 850                 struct lp_build_tgsi_context *bld_base,
 851                 struct lp_build_emit_data *emit_data)
 852 {
 853         struct si_shader_context *ctx = si_shader_context(bld_base);
 854         const struct tgsi_full_instruction * inst = emit_data->inst;
 855         struct ac_image_args args = {};
 856         unsigned num_data = 0;
 857         LLVMValueRef vindex = ctx->i32_0;
 858         LLVMValueRef voffset = ctx->i32_0;
 859
 860         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
 861                 atomic_emit_memory(ctx, emit_data);
 862                 return;
 863         }
 864
 865         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 866                 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
 867                  * of arguments, which is reversed relative to TGSI (and GLSL)
 868                  */
 869                 args.data[num_data++] =
 870                         ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 3, 0));
 871         }
 872
 873         args.data[num_data++] =
 874                 ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 2, 0));
 875         args.cache_policy = get_cache_policy(ctx, inst, true, false, false);
 876
 877         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 878                 args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], false);
 879                 voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0));
 880         } else {
 881                 image_fetch_rsrc(bld_base, &inst->Src[0], true,
 882                                 inst->Memory.Texture, &args.resource);
 883                 image_fetch_coords(bld_base, inst, 1, args.resource, args.coords);
 884                 vindex = args.coords[0]; /* for buffers only */
 885         }
 886
 887         if (HAVE_LLVM >= 0x0800 &&
 888             inst->Src[0].Register.File != TGSI_FILE_BUFFER &&
 889             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 890                 LLVMValueRef buf_args[7];
 891                 unsigned num_args = 0;
 892
 893                 buf_args[num_args++] = args.data[0];
 894                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
 895                         buf_args[num_args++] = args.data[1];
 896
 897                 buf_args[num_args++] = args.resource;
 898                 buf_args[num_args++] = vindex;
 899                 buf_args[num_args++] = voffset;
 900                 buf_args[num_args++] = ctx->i32_0; /* soffset */
 901                 buf_args[num_args++] = LLVMConstInt(ctx->i32, args.cache_policy & ac_slc, 0);
 902
 903                 char intrinsic_name[64];
 904                 snprintf(intrinsic_name, sizeof(intrinsic_name),
 905                          "llvm.amdgcn.struct.buffer.atomic.%s", action->intr_name);
 906                 emit_data->output[emit_data->chan] =
 907                         ac_to_float(&ctx->ac,
 908                                     ac_build_intrinsic(&ctx->ac, intrinsic_name,
 909                                                        ctx->i32, buf_args, num_args, 0));
 910                 return;
 911         }
 912
 913         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
 914             (HAVE_LLVM < 0x0800 &&
 915              inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
 916                 LLVMValueRef buf_args[7];
 917                 unsigned num_args = 0;
 918
 919                 buf_args[num_args++] = args.data[0];
 920                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
 921                         buf_args[num_args++] = args.data[1];
 922
 923                 buf_args[num_args++] = args.resource;
 924                 buf_args[num_args++] = vindex;
 925                 buf_args[num_args++] = voffset;
 926                 buf_args[num_args++] = args.cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
 927
 928                 char intrinsic_name[40];
 929                 snprintf(intrinsic_name, sizeof(intrinsic_name),
 930                          "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
 931                 emit_data->output[emit_data->chan] =
 932                         ac_to_float(&ctx->ac,
 933                                     ac_build_intrinsic(&ctx->ac, intrinsic_name,
 934                                                        ctx->i32, buf_args, num_args, 0));
 935         } else {
 936                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 937                         args.opcode = ac_image_atomic_cmpswap;
 938                 } else {
 939                         args.opcode = ac_image_atomic;
 940                         switch (inst->Instruction.Opcode) {
 941                         case TGSI_OPCODE_ATOMXCHG: args.atomic = ac_atomic_swap; break;
 942                         case TGSI_OPCODE_ATOMUADD: args.atomic = ac_atomic_add; break;
 943                         case TGSI_OPCODE_ATOMAND: args.atomic = ac_atomic_and; break;
 944                         case TGSI_OPCODE_ATOMOR: args.atomic = ac_atomic_or; break;
 945                         case TGSI_OPCODE_ATOMXOR: args.atomic = ac_atomic_xor; break;
 946                         case TGSI_OPCODE_ATOMUMIN: args.atomic = ac_atomic_umin; break;
 947                         case TGSI_OPCODE_ATOMUMAX: args.atomic = ac_atomic_umax; break;
 948                         case TGSI_OPCODE_ATOMIMIN: args.atomic = ac_atomic_smin; break;
 949                         case TGSI_OPCODE_ATOMIMAX: args.atomic = ac_atomic_smax; break;
 950                         default: unreachable("unhandled image atomic");
 951                         }
 952                 }
 953
 954                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 955                 emit_data->output[emit_data->chan] =
 956                         ac_to_float(&ctx->ac, ac_build_image_opcode(&ctx->ac, &args));
 957         }
 958 }
 959
 960 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
 961                                 unsigned target, LLVMValueRef out)
 962 {
 963         LLVMBuilderRef builder = ctx->ac.builder;
 964
 965         /* 1D textures are allocated and used as 2D on GFX9. */
 966         if (ctx->screen->info.chip_class >= GFX9 &&
 967             (target == TGSI_TEXTURE_1D_ARRAY ||
 968              target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
 969                 LLVMValueRef layers =
 970                         LLVMBuildExtractElement(builder, out,
 971                                                 LLVMConstInt(ctx->i32, 2, 0), "");
 972                 out = LLVMBuildInsertElement(builder, out, layers,
 973                                              ctx->i32_1, "");
 974         }
 975
 976         /* Divide the number of layers by 6 to get the number of cubes. */
 977         if (target == TGSI_TEXTURE_CUBE_ARRAY ||
 978             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
 979                 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
 980
 981                 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
 982                 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
 983
 984                 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
 985         }
 986         return out;
 987 }
 988
 989 static void resq_emit(
 990                 const struct lp_build_tgsi_action *action,
 991                 struct lp_build_tgsi_context *bld_base,
 992                 struct lp_build_emit_data *emit_data)
 993 {
 994         struct si_shader_context *ctx = si_shader_context(bld_base);
 995         LLVMBuilderRef builder = ctx->ac.builder;
 996         const struct tgsi_full_instruction *inst = emit_data->inst;
 997         const struct tgsi_full_src_register *reg =
 998                 &inst->Src[inst->Instruction.Opcode == TGSI_OPCODE_TXQ ? 1 : 0];
 999
1000         if (reg->Register.File == TGSI_FILE_BUFFER) {
1001                 LLVMValueRef rsrc = shader_buffer_fetch_rsrc(ctx, reg, false);
1002
1003                 emit_data->output[emit_data->chan] =
1004                         LLVMBuildExtractElement(builder, rsrc,
1005                                                 LLVMConstInt(ctx->i32, 2, 0), "");
1006                 return;
1007         }
1008
1009         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
1010             inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
1011                 LLVMValueRef rsrc;
1012
1013                 tex_fetch_ptrs(bld_base, emit_data, &rsrc, NULL, NULL);
1014                 /* Read the size from the buffer descriptor directly. */
1015                 emit_data->output[emit_data->chan] =
1016                         get_buffer_size(bld_base, rsrc);
1017                 return;
1018         }
1019
1020         if (inst->Instruction.Opcode == TGSI_OPCODE_RESQ &&
1021             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
1022                 LLVMValueRef rsrc;
1023
1024                 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, &rsrc);
1025                 emit_data->output[emit_data->chan] =
1026                         get_buffer_size(bld_base, rsrc);
1027                 return;
1028         }
1029
1030         unsigned target;
1031
1032         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
1033                 target = inst->Texture.Texture;
1034         } else {
1035                 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
1036                         target = TGSI_TEXTURE_2D_ARRAY;
1037                 else
1038                         target = inst->Memory.Texture;
1039         }
1040
1041         struct ac_image_args args = {};
1042         args.opcode = ac_image_get_resinfo;
1043         args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1044         args.dmask = 0xf;
1045
1046         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
1047                 tex_fetch_ptrs(bld_base, emit_data, &args.resource, NULL, NULL);
1048                 args.lod = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
1049         } else {
1050                 image_fetch_rsrc(bld_base, reg, false, target, &args.resource);
1051                 args.lod = ctx->i32_0;
1052         }
1053
1054         emit_data->output[emit_data->chan] =
1055                 fix_resinfo(ctx, target, ac_build_image_opcode(&ctx->ac, &args));
1056 }
1057
1058 /**
1059  * Load an image view, fmask view. or sampler state descriptor.
1060  */
1061 LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx,
1062                                   LLVMValueRef list, LLVMValueRef index,
1063                                   enum ac_descriptor_type type)
1064 {
1065         LLVMBuilderRef builder = ctx->ac.builder;
1066
1067         switch (type) {
1068         case AC_DESC_IMAGE:
1069                 /* The image is at [0:7]. */
1070                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
1071                 break;
1072         case AC_DESC_BUFFER:
1073                 /* The buffer is in [4:7]. */
1074                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 4, 0),
1075                                       ctx->i32_1);
1076                 list = LLVMBuildPointerCast(builder, list,
1077                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
1078                 break;
1079         case AC_DESC_FMASK:
1080                 /* The FMASK is at [8:15]. */
1081                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0),
1082                                       ctx->i32_1);
1083                 break;
1084         case AC_DESC_SAMPLER:
1085                 /* The sampler state is at [12:15]. */
1086                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 4, 0),
1087                                       LLVMConstInt(ctx->i32, 3, 0));
1088                 list = LLVMBuildPointerCast(builder, list,
1089                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
1090                 break;
1091         }
1092
1093         return ac_build_load_to_sgpr(&ctx->ac, list, index);
1094 }
1095
1096 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
1097  *
1098  * SI-CI:
1099  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
1100  *   filtering manually. The driver sets img7 to a mask clearing
1101  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
1102  *     s_and_b32 samp0, samp0, img7
1103  *
1104  * VI:
1105  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
1106  */
1107 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
1108                                            LLVMValueRef res, LLVMValueRef samp)
1109 {
1110         LLVMValueRef img7, samp0;
1111
1112         if (ctx->screen->info.chip_class >= VI)
1113                 return samp;
1114
1115         img7 = LLVMBuildExtractElement(ctx->ac.builder, res,
1116                                        LLVMConstInt(ctx->i32, 7, 0), "");
1117         samp0 = LLVMBuildExtractElement(ctx->ac.builder, samp,
1118                                         ctx->i32_0, "");
1119         samp0 = LLVMBuildAnd(ctx->ac.builder, samp0, img7, "");
1120         return LLVMBuildInsertElement(ctx->ac.builder, samp, samp0,
1121                                       ctx->i32_0, "");
1122 }
1123
1124 static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
1125                            struct lp_build_emit_data *emit_data,
1126                            LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
1127                            LLVMValueRef *fmask_ptr)
1128 {
1129         struct si_shader_context *ctx = si_shader_context(bld_base);
1130         LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images);
1131         const struct tgsi_full_instruction *inst = emit_data->inst;
1132         const struct tgsi_full_src_register *reg;
1133         unsigned target = inst->Texture.Texture;
1134         unsigned sampler_src;
1135         LLVMValueRef index;
1136
1137         sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
1138         reg = &emit_data->inst->Src[sampler_src];
1139
1140         if (reg->Register.Indirect) {
1141                 index = si_get_bounded_indirect_index(ctx,
1142                                                       &reg->Indirect,
1143                                                       reg->Register.Index,
1144                                                       ctx->num_samplers);
1145                 index = LLVMBuildAdd(ctx->ac.builder, index,
1146                                      LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), "");
1147         } else {
1148                 index = LLVMConstInt(ctx->i32,
1149                                      si_get_sampler_slot(reg->Register.Index), 0);
1150         }
1151
1152         if (reg->Register.File != TGSI_FILE_SAMPLER) {
1153                 /* Bindless descriptors are accessible from a different pair of
1154                  * user SGPR indices.
1155                  */
1156                 list = LLVMGetParam(ctx->main_fn,
1157                                     ctx->param_bindless_samplers_and_images);
1158                 index = lp_build_emit_fetch_src(bld_base, reg,
1159                                                 TGSI_TYPE_UNSIGNED, 0);
1160
1161                 /* Since bindless handle arithmetic can contain an unsigned integer
1162                  * wraparound and si_load_sampler_desc assumes there isn't any,
1163                  * use GEP without "inbounds" (inside ac_build_pointer_add)
1164                  * to prevent incorrect code generation and hangs.
1165                  */
1166                 index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
1167                 list = ac_build_pointer_add(&ctx->ac, list, index);
1168                 index = ctx->i32_0;
1169         }
1170
1171         if (target == TGSI_TEXTURE_BUFFER)
1172                 *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_BUFFER);
1173         else
1174                 *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_IMAGE);
1175
1176         if (samp_ptr)
1177                 *samp_ptr = NULL;
1178         if (fmask_ptr)
1179                 *fmask_ptr = NULL;
1180
1181         if (target == TGSI_TEXTURE_2D_MSAA ||
1182             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
1183                 if (fmask_ptr)
1184                         *fmask_ptr = si_load_sampler_desc(ctx, list, index,
1185                                                           AC_DESC_FMASK);
1186         } else if (target != TGSI_TEXTURE_BUFFER) {
1187                 if (samp_ptr) {
1188                         *samp_ptr = si_load_sampler_desc(ctx, list, index,
1189                                                          AC_DESC_SAMPLER);
1190                         *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
1191                 }
1192         }
1193 }
1194
1195 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
1196  * incorrectly forces nearest filtering if the texture format is integer.
1197  * The only effect it has on Gather4, which always returns 4 texels for
1198  * bilinear filtering, is that the final coordinates are off by 0.5 of
1199  * the texel size.
1200  *
1201  * The workaround is to subtract 0.5 from the unnormalized coordinates,
1202  * or (0.5 / size) from the normalized coordinates.
1203  *
1204  * However, cube textures with 8_8_8_8 data formats require a different
1205  * workaround of overriding the num format to USCALED/SSCALED. This would lose
1206  * precision in 32-bit data formats, so it needs to be applied dynamically at
1207  * runtime. In this case, return an i1 value that indicates whether the
1208  * descriptor was overridden (and hence a fixup of the sampler result is needed).
1209  */
1210 static LLVMValueRef
1211 si_lower_gather4_integer(struct si_shader_context *ctx,
1212                          struct ac_image_args *args,
1213                          unsigned target,
1214                          enum tgsi_return_type return_type)
1215 {
1216         LLVMBuilderRef builder = ctx->ac.builder;
1217         LLVMValueRef wa_8888 = NULL;
1218         LLVMValueRef half_texel[2];
1219
1220         assert(return_type == TGSI_RETURN_TYPE_SINT ||
1221                return_type == TGSI_RETURN_TYPE_UINT);
1222
1223         if (target == TGSI_TEXTURE_CUBE ||
1224             target == TGSI_TEXTURE_CUBE_ARRAY) {
1225                 LLVMValueRef formats;
1226                 LLVMValueRef data_format;
1227                 LLVMValueRef wa_formats;
1228
1229                 formats = LLVMBuildExtractElement(builder, args->resource, ctx->i32_1, "");
1230
1231                 data_format = LLVMBuildLShr(builder, formats,
1232                                             LLVMConstInt(ctx->i32, 20, false), "");
1233                 data_format = LLVMBuildAnd(builder, data_format,
1234                                            LLVMConstInt(ctx->i32, (1u << 6) - 1, false), "");
1235                 wa_8888 = LLVMBuildICmp(
1236                         builder, LLVMIntEQ, data_format,
1237                         LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false),
1238                         "");
1239
1240                 uint32_t wa_num_format =
1241                         return_type == TGSI_RETURN_TYPE_UINT ?
1242                         S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_USCALED) :
1243                         S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_SSCALED);
1244                 wa_formats = LLVMBuildAnd(builder, formats,
1245                                           LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT_GFX6, false),
1246                                           "");
1247                 wa_formats = LLVMBuildOr(builder, wa_formats,
1248                                         LLVMConstInt(ctx->i32, wa_num_format, false), "");
1249
1250                 formats = LLVMBuildSelect(builder, wa_8888, wa_formats, formats, "");
1251                 args->resource = LLVMBuildInsertElement(
1252                         builder, args->resource, formats, ctx->i32_1, "");
1253         }
1254
1255         if (target == TGSI_TEXTURE_RECT ||
1256             target == TGSI_TEXTURE_SHADOWRECT) {
1257                 assert(!wa_8888);
1258                 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
1259         } else {
1260                 struct ac_image_args resinfo = {};
1261                 struct lp_build_if_state if_ctx;
1262
1263                 if (wa_8888) {
1264                         /* Skip the texture size query entirely if we don't need it. */
1265                         lp_build_if(&if_ctx, &ctx->gallivm, LLVMBuildNot(builder, wa_8888, ""));
1266                 }
1267
1268                 /* Query the texture size. */
1269                 resinfo.opcode = ac_image_get_resinfo;
1270                 resinfo.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1271                 resinfo.resource = args->resource;
1272                 resinfo.sampler = args->sampler;
1273                 resinfo.lod = ctx->ac.i32_0;
1274                 resinfo.dmask = 0xf;
1275
1276                 LLVMValueRef texsize =
1277                         fix_resinfo(ctx, target,
1278                                     ac_build_image_opcode(&ctx->ac, &resinfo));
1279
1280                 /* Compute -0.5 / size. */
1281                 for (unsigned c = 0; c < 2; c++) {
1282                         half_texel[c] =
1283                                 LLVMBuildExtractElement(builder, texsize,
1284                                                         LLVMConstInt(ctx->i32, c, 0), "");
1285                         half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
1286                         half_texel[c] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, half_texel[c]);
1287                         half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
1288                                                       LLVMConstReal(ctx->f32, -0.5), "");
1289                 }
1290
1291                 if (wa_8888) {
1292                         lp_build_endif(&if_ctx);
1293
1294                         LLVMBasicBlockRef bb[2] = { if_ctx.true_block, if_ctx.entry_block };
1295
1296                         for (unsigned c = 0; c < 2; c++) {
1297                                 LLVMValueRef values[2] = { half_texel[c], ctx->ac.f32_0 };
1298                                 half_texel[c] = ac_build_phi(&ctx->ac, ctx->f32, 2,
1299                                                              values, bb);
1300                         }
1301                 }
1302         }
1303
1304         for (unsigned c = 0; c < 2; c++) {
1305                 LLVMValueRef tmp;
1306                 tmp = ac_to_float(&ctx->ac, args->coords[c]);
1307                 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
1308                 args->coords[c] = ac_to_integer(&ctx->ac, tmp);
1309         }
1310
1311         return wa_8888;
1312 }
1313
1314 /* The second half of the cube texture 8_8_8_8 integer workaround: adjust the
1315  * result after the gather operation.
1316  */
1317 static LLVMValueRef
1318 si_fix_gather4_integer_result(struct si_shader_context *ctx,
1319                            LLVMValueRef result,
1320                            enum tgsi_return_type return_type,
1321                            LLVMValueRef wa)
1322 {
1323         LLVMBuilderRef builder = ctx->ac.builder;
1324
1325         assert(return_type == TGSI_RETURN_TYPE_SINT ||
1326                return_type == TGSI_RETURN_TYPE_UINT);
1327
1328         for (unsigned chan = 0; chan < 4; ++chan) {
1329                 LLVMValueRef chanv = LLVMConstInt(ctx->i32, chan, false);
1330                 LLVMValueRef value;
1331                 LLVMValueRef wa_value;
1332
1333                 value = LLVMBuildExtractElement(builder, result, chanv, "");
1334
1335                 if (return_type == TGSI_RETURN_TYPE_UINT)
1336                         wa_value = LLVMBuildFPToUI(builder, value, ctx->i32, "");
1337                 else
1338                         wa_value = LLVMBuildFPToSI(builder, value, ctx->i32, "");
1339                 wa_value = ac_to_float(&ctx->ac, wa_value);
1340                 value = LLVMBuildSelect(builder, wa, wa_value, value, "");
1341
1342                 result = LLVMBuildInsertElement(builder, result, value, chanv, "");
1343         }
1344
1345         return result;
1346 }
1347
1348 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
1349                                 struct lp_build_tgsi_context *bld_base,
1350                                 struct lp_build_emit_data *emit_data)
1351 {
1352         struct si_shader_context *ctx = si_shader_context(bld_base);
1353         const struct tgsi_full_instruction *inst = emit_data->inst;
1354         unsigned opcode = inst->Instruction.Opcode;
1355         unsigned target = inst->Texture.Texture;
1356         struct ac_image_args args = {};
1357         int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
1358         unsigned chan;
1359         bool has_offset = inst->Texture.NumOffsets > 0;
1360         LLVMValueRef fmask_ptr = NULL;
1361
1362         tex_fetch_ptrs(bld_base, emit_data, &args.resource, &args.sampler, &fmask_ptr);
1363
1364         if (target == TGSI_TEXTURE_BUFFER) {
1365                 LLVMValueRef vindex = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
1366                 unsigned num_channels =
1367                         util_last_bit(inst->Dst[0].Register.WriteMask);
1368                 LLVMValueRef result =
1369                         ac_build_buffer_load_format(&ctx->ac,
1370                                                     args.resource,
1371                                                     vindex,
1372                                                     ctx->i32_0,
1373                                                     num_channels, false, true);
1374                 emit_data->output[emit_data->chan] =
1375                         ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
1376                 return;
1377         }
1378
1379         /* Fetch and project texture coordinates */
1380         args.coords[3] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_W);
1381         for (chan = 0; chan < 3; chan++) {
1382                 args.coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
1383                 if (opcode == TGSI_OPCODE_TXP)
1384                         args.coords[chan] = ac_build_fdiv(&ctx->ac,
1385                                 args.coords[chan], args.coords[3]);
1386         }
1387
1388         if (opcode == TGSI_OPCODE_TXP)
1389                 args.coords[3] = ctx->ac.f32_1;
1390
1391         /* Pack offsets. */
1392         if (has_offset &&
1393             opcode != TGSI_OPCODE_TXF &&
1394             opcode != TGSI_OPCODE_TXF_LZ) {
1395                 /* The offsets are six-bit signed integers packed like this:
1396                  *   X=[5:0], Y=[13:8], and Z=[21:16].
1397                  */
1398                 LLVMValueRef offset[3], pack;
1399
1400                 assert(inst->Texture.NumOffsets == 1);
1401
1402                 for (chan = 0; chan < 3; chan++) {
1403                         offset[chan] = lp_build_emit_fetch_texoffset(bld_base, inst, 0, chan);
1404                         offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan],
1405                                                     LLVMConstInt(ctx->i32, 0x3f, 0), "");
1406                         if (chan)
1407                                 offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan],
1408                                                             LLVMConstInt(ctx->i32, chan*8, 0), "");
1409                 }
1410
1411                 pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], "");
1412                 pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], "");
1413                 args.offset = pack;
1414         }
1415
1416         /* Pack LOD bias value */
1417         if (opcode == TGSI_OPCODE_TXB)
1418                 args.bias = args.coords[3];
1419         if (opcode == TGSI_OPCODE_TXB2)
1420                 args.bias = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1421
1422         /* Pack depth comparison value */
1423         if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
1424                 LLVMValueRef z;
1425
1426                 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1427                         z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1428                 } else {
1429                         assert(ref_pos >= 0);
1430                         z = args.coords[ref_pos];
1431                 }
1432
1433                 /* Section 8.23.1 (Depth Texture Comparison Mode) of the
1434                  * OpenGL 4.5 spec says:
1435                  *
1436                  *    "If the texture’s internal format indicates a fixed-point
1437                  *     depth texture, then D_t and D_ref are clamped to the
1438                  *     range [0, 1]; otherwise no clamping is performed."
1439                  *
1440                  * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
1441                  * so the depth comparison value isn't clamped for Z16 and
1442                  * Z24 anymore. Do it manually here.
1443                  */
1444                 if (ctx->screen->info.chip_class >= VI) {
1445                         LLVMValueRef upgraded;
1446                         LLVMValueRef clamped;
1447                         upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler,
1448                                                            LLVMConstInt(ctx->i32, 3, false), "");
1449                         upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded,
1450                                                  LLVMConstInt(ctx->i32, 29, false), "");
1451                         upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->i1, "");
1452                         clamped = ac_build_clamp(&ctx->ac, z);
1453                         z = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, z, "");
1454                 }
1455
1456                 args.compare = z;
1457         }
1458
1459         /* Pack user derivatives */
1460         if (opcode == TGSI_OPCODE_TXD) {
1461                 int param, num_src_deriv_channels, num_dst_deriv_channels;
1462
1463                 switch (target) {
1464                 case TGSI_TEXTURE_3D:
1465                         num_src_deriv_channels = 3;
1466                         num_dst_deriv_channels = 3;
1467                         break;
1468                 case TGSI_TEXTURE_2D:
1469                 case TGSI_TEXTURE_SHADOW2D:
1470                 case TGSI_TEXTURE_RECT:
1471                 case TGSI_TEXTURE_SHADOWRECT:
1472                 case TGSI_TEXTURE_2D_ARRAY:
1473                 case TGSI_TEXTURE_SHADOW2D_ARRAY:
1474                         num_src_deriv_channels = 2;
1475                         num_dst_deriv_channels = 2;
1476                         break;
1477                 case TGSI_TEXTURE_CUBE:
1478                 case TGSI_TEXTURE_SHADOWCUBE:
1479                 case TGSI_TEXTURE_CUBE_ARRAY:
1480                 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
1481                         /* Cube derivatives will be converted to 2D. */
1482                         num_src_deriv_channels = 3;
1483                         num_dst_deriv_channels = 3;
1484                         break;
1485                 case TGSI_TEXTURE_1D:
1486                 case TGSI_TEXTURE_SHADOW1D:
1487                 case TGSI_TEXTURE_1D_ARRAY:
1488                 case TGSI_TEXTURE_SHADOW1D_ARRAY:
1489                         num_src_deriv_channels = 1;
1490
1491                         /* 1D textures are allocated and used as 2D on GFX9. */
1492                         if (ctx->screen->info.chip_class >= GFX9) {
1493                                 num_dst_deriv_channels = 2;
1494                         } else {
1495                                 num_dst_deriv_channels = 1;
1496                         }
1497                         break;
1498                 default:
1499                         unreachable("invalid target");
1500                 }
1501
1502                 for (param = 0; param < 2; param++) {
1503                         for (chan = 0; chan < num_src_deriv_channels; chan++)
1504                                 args.derivs[param * num_dst_deriv_channels + chan] =
1505                                         lp_build_emit_fetch(bld_base, inst, param+1, chan);
1506
1507                         /* Fill in the rest with zeros. */
1508                         for (chan = num_src_deriv_channels;
1509                              chan < num_dst_deriv_channels; chan++)
1510                                 args.derivs[param * num_dst_deriv_channels + chan] =
1511                                         ctx->ac.f32_0;
1512                 }
1513         }
1514
1515         if (target == TGSI_TEXTURE_CUBE ||
1516             target == TGSI_TEXTURE_CUBE_ARRAY ||
1517             target == TGSI_TEXTURE_SHADOWCUBE ||
1518             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1519                 ac_prepare_cube_coords(&ctx->ac,
1520                                        opcode == TGSI_OPCODE_TXD,
1521                                        target == TGSI_TEXTURE_CUBE_ARRAY ||
1522                                        target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
1523                                        opcode == TGSI_OPCODE_LODQ,
1524                                        args.coords, args.derivs);
1525         } else if (tgsi_is_array_sampler(target) &&
1526                    opcode != TGSI_OPCODE_TXF &&
1527                    opcode != TGSI_OPCODE_TXF_LZ &&
1528                    ctx->screen->info.chip_class <= VI) {
1529                 unsigned array_coord = target == TGSI_TEXTURE_1D_ARRAY ? 1 : 2;
1530                 args.coords[array_coord] = ac_build_round(&ctx->ac, args.coords[array_coord]);
1531         }
1532
1533         /* 1D textures are allocated and used as 2D on GFX9. */
1534         if (ctx->screen->info.chip_class >= GFX9) {
1535                 LLVMValueRef filler;
1536
1537                 /* Use 0.5, so that we don't sample the border color. */
1538                 if (opcode == TGSI_OPCODE_TXF ||
1539                     opcode == TGSI_OPCODE_TXF_LZ)
1540                         filler = ctx->i32_0;
1541                 else
1542                         filler = LLVMConstReal(ctx->f32, 0.5);
1543
1544                 if (target == TGSI_TEXTURE_1D ||
1545                     target == TGSI_TEXTURE_SHADOW1D) {
1546                         args.coords[1] = filler;
1547                 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
1548                            target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
1549                         args.coords[2] = args.coords[1];
1550                         args.coords[1] = filler;
1551                 }
1552         }
1553
1554         /* Pack LOD or sample index */
1555         if (opcode == TGSI_OPCODE_TXL)
1556                 args.lod = args.coords[3];
1557         else if (opcode == TGSI_OPCODE_TXL2)
1558                 args.lod = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1559         else if (opcode == TGSI_OPCODE_TXF) {
1560                 if (target == TGSI_TEXTURE_2D_MSAA) {
1561                         /* No LOD, but move sample index into the right place. */
1562                         args.coords[2] = args.coords[3];
1563                 } else if (target != TGSI_TEXTURE_2D_ARRAY_MSAA) {
1564                         args.lod = args.coords[3];
1565                 }
1566         }
1567
1568         if (target == TGSI_TEXTURE_2D_MSAA ||
1569             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
1570                 ac_apply_fmask_to_sample(&ctx->ac, fmask_ptr, args.coords,
1571                                          target == TGSI_TEXTURE_2D_ARRAY_MSAA);
1572         }
1573
1574         if (opcode == TGSI_OPCODE_TXF ||
1575             opcode == TGSI_OPCODE_TXF_LZ) {
1576                 /* add tex offsets */
1577                 if (inst->Texture.NumOffsets) {
1578                         const struct tgsi_texture_offset *off = inst->TexOffsets;
1579
1580                         assert(inst->Texture.NumOffsets == 1);
1581
1582                         switch (target) {
1583                         case TGSI_TEXTURE_3D:
1584                                 args.coords[2] =
1585                                         LLVMBuildAdd(ctx->ac.builder, args.coords[2],
1586                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ], "");
1587                                 /* fall through */
1588                         case TGSI_TEXTURE_2D:
1589                         case TGSI_TEXTURE_SHADOW2D:
1590                         case TGSI_TEXTURE_RECT:
1591                         case TGSI_TEXTURE_SHADOWRECT:
1592                         case TGSI_TEXTURE_2D_ARRAY:
1593                         case TGSI_TEXTURE_SHADOW2D_ARRAY:
1594                                 args.coords[1] =
1595                                         LLVMBuildAdd(ctx->ac.builder, args.coords[1],
1596                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY], "");
1597                                 /* fall through */
1598                         case TGSI_TEXTURE_1D:
1599                         case TGSI_TEXTURE_SHADOW1D:
1600                         case TGSI_TEXTURE_1D_ARRAY:
1601                         case TGSI_TEXTURE_SHADOW1D_ARRAY:
1602                                 args.coords[0] =
1603                                         LLVMBuildAdd(ctx->ac.builder, args.coords[0],
1604                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX], "");
1605                                 break;
1606                                 /* texture offsets do not apply to other texture targets */
1607                         }
1608                 }
1609         }
1610
1611         if (opcode == TGSI_OPCODE_TG4) {
1612                 unsigned gather_comp = 0;
1613
1614                 /* DMASK was repurposed for GATHER4. 4 components are always
1615                  * returned and DMASK works like a swizzle - it selects
1616                  * the component to fetch. The only valid DMASK values are
1617                  * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
1618                  * (red,red,red,red) etc.) The ISA document doesn't mention
1619                  * this.
1620                  */
1621
1622                 /* Get the component index from src1.x for Gather4. */
1623                 if (!tgsi_is_shadow_target(target)) {
1624                         LLVMValueRef comp_imm;
1625                         struct tgsi_src_register src1 = inst->Src[1].Register;
1626
1627                         assert(src1.File == TGSI_FILE_IMMEDIATE);
1628
1629                         comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
1630                         gather_comp = LLVMConstIntGetZExtValue(comp_imm);
1631                         gather_comp = CLAMP(gather_comp, 0, 3);
1632                 }
1633
1634                 args.dmask = 1 << gather_comp;
1635         } else {
1636                 args.dmask = 0xf;
1637         }
1638
1639         args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1640         args.unorm = target == TGSI_TEXTURE_RECT ||
1641                      target == TGSI_TEXTURE_SHADOWRECT;
1642         args.opcode = ac_image_sample;
1643
1644         switch (opcode) {
1645         case TGSI_OPCODE_TXF:
1646         case TGSI_OPCODE_TXF_LZ:
1647                 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
1648                               target == TGSI_TEXTURE_2D_MSAA ||
1649                               target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
1650                                       ac_image_load : ac_image_load_mip;
1651                 break;
1652         case TGSI_OPCODE_LODQ:
1653                 args.opcode = ac_image_get_lod;
1654                 break;
1655         case TGSI_OPCODE_TEX:
1656         case TGSI_OPCODE_TEX2:
1657         case TGSI_OPCODE_TXP:
1658                 if (ctx->type != PIPE_SHADER_FRAGMENT)
1659                         args.level_zero = true;
1660                 break;
1661         case TGSI_OPCODE_TEX_LZ:
1662                 args.level_zero = true;
1663                 break;
1664         case TGSI_OPCODE_TXB:
1665         case TGSI_OPCODE_TXB2:
1666                 assert(ctx->type == PIPE_SHADER_FRAGMENT);
1667                 break;
1668         case TGSI_OPCODE_TXL:
1669         case TGSI_OPCODE_TXL2:
1670                 break;
1671         case TGSI_OPCODE_TXD:
1672                 break;
1673         case TGSI_OPCODE_TG4:
1674                 args.opcode = ac_image_gather4;
1675                 args.level_zero = true;
1676                 break;
1677         default:
1678                 assert(0);
1679                 return;
1680         }
1681
1682         /* The hardware needs special lowering for Gather4 with integer formats. */
1683         LLVMValueRef gather4_int_result_workaround = NULL;
1684
1685         if (ctx->screen->info.chip_class <= VI &&
1686             opcode == TGSI_OPCODE_TG4) {
1687                 assert(inst->Texture.ReturnType != TGSI_RETURN_TYPE_UNKNOWN);
1688
1689                 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
1690                     inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
1691                         gather4_int_result_workaround =
1692                                 si_lower_gather4_integer(ctx, &args, target,
1693                                                          inst->Texture.ReturnType);
1694                 }
1695         }
1696
1697         args.attributes = AC_FUNC_ATTR_READNONE;
1698         LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
1699
1700         if (gather4_int_result_workaround) {
1701                 result = si_fix_gather4_integer_result(ctx, result,
1702                                                        inst->Texture.ReturnType,
1703                                                        gather4_int_result_workaround);
1704         }
1705
1706         emit_data->output[emit_data->chan] = result;
1707 }
1708
1709 static void si_llvm_emit_txqs(
1710         const struct lp_build_tgsi_action *action,
1711         struct lp_build_tgsi_context *bld_base,
1712         struct lp_build_emit_data *emit_data)
1713 {
1714         struct si_shader_context *ctx = si_shader_context(bld_base);
1715         LLVMValueRef res, samples;
1716         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
1717
1718         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
1719
1720         /* Read the samples from the descriptor directly. */
1721         res = LLVMBuildBitCast(ctx->ac.builder, res_ptr, ctx->v8i32, "");
1722         samples = LLVMBuildExtractElement(ctx->ac.builder, res,
1723                                           LLVMConstInt(ctx->i32, 3, 0), "");
1724         samples = LLVMBuildLShr(ctx->ac.builder, samples,
1725                                 LLVMConstInt(ctx->i32, 16, 0), "");
1726         samples = LLVMBuildAnd(ctx->ac.builder, samples,
1727                                LLVMConstInt(ctx->i32, 0xf, 0), "");
1728         samples = LLVMBuildShl(ctx->ac.builder, ctx->i32_1,
1729                                samples, "");
1730
1731         emit_data->output[emit_data->chan] = samples;
1732 }
1733
1734 static void si_llvm_emit_fbfetch(const struct lp_build_tgsi_action *action,
1735                                  struct lp_build_tgsi_context *bld_base,
1736                                  struct lp_build_emit_data *emit_data)
1737 {
1738         struct si_shader_context *ctx = si_shader_context(bld_base);
1739         struct ac_image_args args = {};
1740         LLVMValueRef ptr, image, fmask;
1741
1742         /* Ignore src0, because KHR_blend_func_extended disallows multiple render
1743          * targets.
1744          */
1745
1746         /* Load the image descriptor. */
1747         STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
1748         ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1749         ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr,
1750                                    ac_array_in_const32_addr_space(ctx->v8i32), "");
1751         image = ac_build_load_to_sgpr(&ctx->ac, ptr,
1752                         LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
1753
1754         unsigned chan = 0;
1755
1756         args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 0, 16);
1757
1758         if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D)
1759                 args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 16, 16);
1760
1761         /* Get the current render target layer index. */
1762         if (ctx->shader->key.mono.u.ps.fbfetch_layered)
1763                 args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_ANCILLARY, 16, 11);
1764
1765         if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
1766                 args.coords[chan++] = si_get_sample_id(ctx);
1767
1768         if (ctx->shader->key.mono.u.ps.fbfetch_msaa) {
1769                 fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
1770                         LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
1771
1772                 ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
1773                                          ctx->shader->key.mono.u.ps.fbfetch_layered);
1774         }
1775
1776         args.opcode = ac_image_load;
1777         args.resource = image;
1778         args.dmask = 0xf;
1779         if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
1780                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1781                         ac_image_2darraymsaa : ac_image_2dmsaa;
1782         else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D)
1783                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1784                         ac_image_1darray : ac_image_1d;
1785         else
1786                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1787                         ac_image_2darray : ac_image_2d;
1788
1789         emit_data->output[emit_data->chan] =
1790                 ac_build_image_opcode(&ctx->ac, &args);
1791 }
1792
1793 /**
1794  * Setup actions for TGSI memory opcode, including texture opcodes.
1795  */
1796 void si_shader_context_init_mem(struct si_shader_context *ctx)
1797 {
1798         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1799
1800         bld_base->op_actions[TGSI_OPCODE_TEX].emit = build_tex_intrinsic;
1801         bld_base->op_actions[TGSI_OPCODE_TEX_LZ].emit = build_tex_intrinsic;
1802         bld_base->op_actions[TGSI_OPCODE_TEX2].emit = build_tex_intrinsic;
1803         bld_base->op_actions[TGSI_OPCODE_TXB].emit = build_tex_intrinsic;
1804         bld_base->op_actions[TGSI_OPCODE_TXB2].emit = build_tex_intrinsic;
1805         bld_base->op_actions[TGSI_OPCODE_TXD].emit = build_tex_intrinsic;
1806         bld_base->op_actions[TGSI_OPCODE_TXF].emit = build_tex_intrinsic;
1807         bld_base->op_actions[TGSI_OPCODE_TXF_LZ].emit = build_tex_intrinsic;
1808         bld_base->op_actions[TGSI_OPCODE_TXL].emit = build_tex_intrinsic;
1809         bld_base->op_actions[TGSI_OPCODE_TXL2].emit = build_tex_intrinsic;
1810         bld_base->op_actions[TGSI_OPCODE_TXP].emit = build_tex_intrinsic;
1811         bld_base->op_actions[TGSI_OPCODE_TXQ].emit = resq_emit;
1812         bld_base->op_actions[TGSI_OPCODE_TG4].emit = build_tex_intrinsic;
1813         bld_base->op_actions[TGSI_OPCODE_LODQ].emit = build_tex_intrinsic;
1814         bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
1815
1816         bld_base->op_actions[TGSI_OPCODE_FBFETCH].emit = si_llvm_emit_fbfetch;
1817
1818         bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
1819         bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
1820         bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
1821
1822         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].emit = atomic_emit;
1823         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
1824         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].emit = atomic_emit;
1825         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
1826         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].emit = atomic_emit;
1827         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
1828         bld_base->op_actions[TGSI_OPCODE_ATOMAND].emit = atomic_emit;
1829         bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
1830         bld_base->op_actions[TGSI_OPCODE_ATOMOR].emit = atomic_emit;
1831         bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
1832         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].emit = atomic_emit;
1833         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
1834         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].emit = atomic_emit;
1835         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
1836         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].emit = atomic_emit;
1837         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
1838         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].emit = atomic_emit;
1839         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
1840         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].emit = atomic_emit;
1841         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
1842 }