src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c

   1 /*
   2  * Copyright 2017 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include <llvm/Config/llvm-config.h>
  26
  27 #include "si_shader_internal.h"
  28 #include "si_pipe.h"
  29 #include "sid.h"
  30 #include "tgsi/tgsi_build.h"
  31 #include "tgsi/tgsi_util.h"
  32 #include "ac_llvm_util.h"
  33
/* Forward declaration of tex_fetch_ptrs(); its definition is not part of
 * this section of the file.
 * NOTE(review): judging by the parameter names only, the three
 * LLVMValueRef pointers look like out-parameters for the resource, sampler
 * and FMASK descriptors - confirm against the definition.
 */
static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
                           struct lp_build_emit_data *emit_data,
                           LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
                           LLVMValueRef *fmask_ptr);
  38
  39 /**
  40  * Given a v8i32 resource descriptor for a buffer, extract the size of the
  41  * buffer in number of elements and return it as an i32.
  42  */
  43 static LLVMValueRef get_buffer_size(
  44         struct lp_build_tgsi_context *bld_base,
  45         LLVMValueRef descriptor)
  46 {
  47         struct si_shader_context *ctx = si_shader_context(bld_base);
  48         LLVMBuilderRef builder = ctx->ac.builder;
  49         LLVMValueRef size =
  50                 LLVMBuildExtractElement(builder, descriptor,
  51                                         LLVMConstInt(ctx->i32, 2, 0), "");
  52
  53         if (ctx->screen->info.chip_class == GFX8) {
  54                 /* On GFX8, the descriptor contains the size in bytes,
  55                  * but TXQ must return the size in elements.
  56                  * The stride is always non-zero for resources using TXQ.
  57                  */
  58                 LLVMValueRef stride =
  59                         LLVMBuildExtractElement(builder, descriptor,
  60                                                 ctx->i32_1, "");
  61                 stride = LLVMBuildLShr(builder, stride,
  62                                        LLVMConstInt(ctx->i32, 16, 0), "");
  63                 stride = LLVMBuildAnd(builder, stride,
  64                                       LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
  65
  66                 size = LLVMBuildUDiv(builder, size, stride, "");
  67         }
  68
  69         return size;
  70 }
  71
  72 static LLVMValueRef
  73 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
  74                          const struct tgsi_full_src_register *reg,
  75                          bool ubo)
  76 {
  77         LLVMValueRef index;
  78
  79         if (!reg->Register.Indirect) {
  80                 index = LLVMConstInt(ctx->i32, reg->Register.Index, false);
  81         } else {
  82                 index = si_get_indirect_index(ctx, &reg->Indirect,
  83                                               1, reg->Register.Index);
  84         }
  85
  86         if (ubo)
  87                 return ctx->abi.load_ubo(&ctx->abi, index);
  88         else
  89                 return ctx->abi.load_ssbo(&ctx->abi, index, false);
  90 }
  91
  92 static enum ac_image_dim
  93 ac_texture_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target)
  94 {
  95         switch (target) {
  96         case TGSI_TEXTURE_1D:
  97         case TGSI_TEXTURE_SHADOW1D:
  98                 if (screen->info.chip_class == GFX9)
  99                         return ac_image_2d;
 100                 return ac_image_1d;
 101         case TGSI_TEXTURE_2D:
 102         case TGSI_TEXTURE_SHADOW2D:
 103         case TGSI_TEXTURE_RECT:
 104         case TGSI_TEXTURE_SHADOWRECT:
 105                 return ac_image_2d;
 106         case TGSI_TEXTURE_3D:
 107                 return ac_image_3d;
 108         case TGSI_TEXTURE_CUBE:
 109         case TGSI_TEXTURE_SHADOWCUBE:
 110         case TGSI_TEXTURE_CUBE_ARRAY:
 111         case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
 112                 return ac_image_cube;
 113         case TGSI_TEXTURE_1D_ARRAY:
 114         case TGSI_TEXTURE_SHADOW1D_ARRAY:
 115                 if (screen->info.chip_class == GFX9)
 116                         return ac_image_2darray;
 117                 return ac_image_1darray;
 118         case TGSI_TEXTURE_2D_ARRAY:
 119         case TGSI_TEXTURE_SHADOW2D_ARRAY:
 120                 return ac_image_2darray;
 121         case TGSI_TEXTURE_2D_MSAA:
 122                 return ac_image_2dmsaa;
 123         case TGSI_TEXTURE_2D_ARRAY_MSAA:
 124                 return ac_image_2darraymsaa;
 125         default:
 126                 unreachable("unhandled texture type");
 127         }
 128 }
 129
 130 static enum ac_image_dim
 131 ac_image_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target)
 132 {
 133         enum ac_image_dim dim = ac_texture_dim_from_tgsi_target(screen, target);
 134
 135         /* Match the resource type set in the descriptor. */
 136         if (dim == ac_image_cube ||
 137             (screen->info.chip_class <= GFX8 && dim == ac_image_3d))
 138                 dim = ac_image_2darray;
 139         else if (target == TGSI_TEXTURE_2D && screen->info.chip_class == GFX9) {
 140                 /* When a single layer of a 3D texture is bound, the shader
 141                  * will refer to a 2D target, but the descriptor has a 3D type.
 142                  * Since the HW ignores BASE_ARRAY in this case, we need to
 143                  * send 3 coordinates. This doesn't hurt when the underlying
 144                  * texture is non-3D.
 145                  */
 146                 dim = ac_image_3d;
 147         }
 148
 149         return dim;
 150 }
 151
 152 /**
 153  * Given a 256-bit resource descriptor, force the DCC enable bit to off.
 154  *
 155  * At least on Tonga, executing image stores on images with DCC enabled and
 156  * non-trivial can eventually lead to lockups. This can occur when an
 157  * application binds an image as read-only but then uses a shader that writes
 158  * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
 159  * program termination) in this case, but it doesn't cost much to be a bit
 160  * nicer: disabling DCC in the shader still leads to undefined results but
 161  * avoids the lockup.
 162  */
 163 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
 164                                   LLVMValueRef rsrc)
 165 {
 166         if (ctx->screen->info.chip_class <= GFX7) {
 167                 return rsrc;
 168         } else {
 169                 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
 170                 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
 171                 LLVMValueRef tmp;
 172
 173                 tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
 174                 tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
 175                 return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
 176         }
 177 }
 178
 179 LLVMValueRef si_load_image_desc(struct si_shader_context *ctx,
 180                                 LLVMValueRef list, LLVMValueRef index,
 181                                 enum ac_descriptor_type desc_type,
 182                                 bool uses_store, bool bindless)
 183 {
 184         LLVMBuilderRef builder = ctx->ac.builder;
 185         LLVMValueRef rsrc;
 186
 187         if (desc_type == AC_DESC_BUFFER) {
 188                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0),
 189                                       ctx->i32_1);
 190                 list = LLVMBuildPointerCast(builder, list,
 191                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
 192         } else {
 193                 assert(desc_type == AC_DESC_IMAGE);
 194         }
 195
 196         if (bindless)
 197                 rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
 198         else
 199                 rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
 200
 201         if (desc_type == AC_DESC_IMAGE && uses_store)
 202                 rsrc = force_dcc_off(ctx, rsrc);
 203         return rsrc;
 204 }
 205
 206 /**
 207  * Load the resource descriptor for \p image.
 208  */
 209 static void
 210 image_fetch_rsrc(
 211         struct lp_build_tgsi_context *bld_base,
 212         const struct tgsi_full_src_register *image,
 213         bool is_store, unsigned target,
 214         LLVMValueRef *rsrc)
 215 {
 216         struct si_shader_context *ctx = si_shader_context(bld_base);
 217         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
 218                                              ctx->param_samplers_and_images);
 219         LLVMValueRef index;
 220
 221         if (!image->Register.Indirect) {
 222                 index = LLVMConstInt(ctx->i32,
 223                                      si_get_image_slot(image->Register.Index), 0);
 224         } else {
 225                 /* From the GL_ARB_shader_image_load_store extension spec:
 226                  *
 227                  *    If a shader performs an image load, store, or atomic
 228                  *    operation using an image variable declared as an array,
 229                  *    and if the index used to select an individual element is
 230                  *    negative or greater than or equal to the size of the
 231                  *    array, the results of the operation are undefined but may
 232                  *    not lead to termination.
 233                  */
 234                 index = si_get_bounded_indirect_index(ctx, &image->Indirect,
 235                                                       image->Register.Index,
 236                                                       ctx->num_images);
 237                 index = LLVMBuildSub(ctx->ac.builder,
 238                                      LLVMConstInt(ctx->i32, SI_NUM_IMAGE_SLOTS - 1, 0),
 239                                      index, "");
 240         }
 241
 242         bool bindless = false;
 243
 244         if (image->Register.File != TGSI_FILE_IMAGE) {
 245                 /* Bindless descriptors are accessible from a different pair of
 246                  * user SGPR indices.
 247                  */
 248                 rsrc_ptr = LLVMGetParam(ctx->main_fn,
 249                                         ctx->param_bindless_samplers_and_images);
 250                 index = lp_build_emit_fetch_src(bld_base, image,
 251                                                 TGSI_TYPE_UNSIGNED, 0);
 252
 253                 /* For simplicity, bindless image descriptors use fixed
 254                  * 16-dword slots for now.
 255                  */
 256                 index = LLVMBuildMul(ctx->ac.builder, index,
 257                                      LLVMConstInt(ctx->i32, 2, 0), "");
 258                 bindless = true;
 259         }
 260
 261         *rsrc = si_load_image_desc(ctx, rsrc_ptr, index,
 262                                    target == TGSI_TEXTURE_BUFFER ? AC_DESC_BUFFER : AC_DESC_IMAGE,
 263                                    is_store, bindless);
 264 }
 265
 266 static void image_fetch_coords(
 267                 struct lp_build_tgsi_context *bld_base,
 268                 const struct tgsi_full_instruction *inst,
 269                 unsigned src, LLVMValueRef desc,
 270                 LLVMValueRef *coords)
 271 {
 272         struct si_shader_context *ctx = si_shader_context(bld_base);
 273         LLVMBuilderRef builder = ctx->ac.builder;
 274         unsigned target = inst->Memory.Texture;
 275         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
 276         LLVMValueRef tmp;
 277         int chan;
 278
 279         for (chan = 0; chan < num_coords; ++chan) {
 280                 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
 281                 tmp = ac_to_integer(&ctx->ac, tmp);
 282                 coords[chan] = tmp;
 283         }
 284
 285         if (target == TGSI_TEXTURE_2D_MSAA ||
 286             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
 287                 /* Need the sample index as well. */
 288                 tmp = lp_build_emit_fetch(bld_base, inst, src, TGSI_SWIZZLE_W);
 289                 coords[chan] = ac_to_integer(&ctx->ac, tmp);
 290         }
 291
 292         if (ctx->screen->info.chip_class == GFX9) {
 293                 /* 1D textures are allocated and used as 2D on GFX9. */
 294                 if (target == TGSI_TEXTURE_1D) {
 295                         coords[1] = ctx->i32_0;
 296                 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
 297                         coords[2] = coords[1];
 298                         coords[1] = ctx->i32_0;
 299                 } else if (target == TGSI_TEXTURE_2D) {
 300                         /* The hw can't bind a slice of a 3D image as a 2D
 301                          * image, because it ignores BASE_ARRAY if the target
 302                          * is 3D. The workaround is to read BASE_ARRAY and set
 303                          * it as the 3rd address operand for all 2D images.
 304                          */
 305                         LLVMValueRef first_layer, const5, mask;
 306
 307                         const5 = LLVMConstInt(ctx->i32, 5, 0);
 308                         mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
 309                         first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
 310                         first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
 311
 312                         coords[2] = first_layer;
 313                 }
 314         }
 315 }
 316
 317 static unsigned get_cache_policy(struct si_shader_context *ctx,
 318                                  const struct tgsi_full_instruction *inst,
 319                                  bool atomic, bool may_store_unaligned,
 320                                  bool writeonly_memory)
 321 {
 322         unsigned cache_policy = 0;
 323
 324         if (!atomic &&
 325             /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores.
 326              * All store opcodes not aligned to a dword are affected.
 327              * The only way to get unaligned stores in radeonsi is through
 328              * shader images. */
 329             ((may_store_unaligned && ctx->screen->info.chip_class == GFX6) ||
 330              /* If this is write-only, don't keep data in L1 to prevent
 331               * evicting L1 cache lines that may be needed by other
 332               * instructions. */
 333              writeonly_memory ||
 334              inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE))) {
 335                 cache_policy |= ac_glc;
 336         }
 337
 338         if (inst->Memory.Qualifier & TGSI_MEMORY_STREAM_CACHE_POLICY)
 339                 cache_policy |= ac_slc;
 340
 341         return cache_policy;
 342 }
 343
 344 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
 345                                    const struct tgsi_full_instruction *inst,
 346                                    LLVMTypeRef type, int arg)
 347 {
 348         LLVMBuilderRef builder = ctx->ac.builder;
 349         LLVMValueRef offset, ptr;
 350         int addr_space;
 351
 352         offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
 353         offset = ac_to_integer(&ctx->ac, offset);
 354
 355         ptr = ctx->ac.lds;
 356         ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
 357         addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
 358         ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
 359
 360         return ptr;
 361 }
 362
 363 static void load_emit_memory(
 364                 struct si_shader_context *ctx,
 365                 struct lp_build_emit_data *emit_data)
 366 {
 367         const struct tgsi_full_instruction *inst = emit_data->inst;
 368         unsigned writemask = inst->Dst[0].Register.WriteMask;
 369         LLVMValueRef channels[4], ptr, derived_ptr, index;
 370         int chan;
 371
 372         ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
 373
 374         for (chan = 0; chan < 4; ++chan) {
 375                 if (!(writemask & (1 << chan))) {
 376                         channels[chan] = LLVMGetUndef(ctx->f32);
 377                         continue;
 378                 }
 379
 380                 index = LLVMConstInt(ctx->i32, chan, 0);
 381                 derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, "");
 382                 channels[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, "");
 383         }
 384         emit_data->output[emit_data->chan] = ac_build_gather_values(&ctx->ac, channels, 4);
 385 }
 386
 387 /**
 388  * Return true if the memory accessed by a LOAD or STORE instruction is
 389  * read-only or write-only, respectively.
 390  *
 391  * \param shader_buffers_reverse_access_mask
 392  *      For LOAD, set this to (store | atomic) slot usage in the shader.
 393  *      For STORE, set this to (load | atomic) slot usage in the shader.
 394  * \param images_reverse_access_mask  Same as above, but for images.
 395  * \param bindless_buffer_reverse_access_mask  Same as above, but for bindless image buffers.
 396  * \param bindless_image_reverse_access_mask   Same as above, but for bindless images.
 397  */
 398 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
 399                                   const struct tgsi_shader_info *info,
 400                                   unsigned shader_buffers_reverse_access_mask,
 401                                   unsigned images_reverse_access_mask,
 402                                   bool bindless_buffer_reverse_access_mask,
 403                                   bool bindless_image_reverse_access_mask)
 404 {
 405         enum tgsi_file_type resource_file;
 406         unsigned resource_index;
 407         bool resource_indirect;
 408
 409         if (inst->Instruction.Opcode == TGSI_OPCODE_STORE) {
 410                 resource_file = inst->Dst[0].Register.File;
 411                 resource_index = inst->Dst[0].Register.Index;
 412                 resource_indirect = inst->Dst[0].Register.Indirect;
 413         } else {
 414                 resource_file = inst->Src[0].Register.File;
 415                 resource_index = inst->Src[0].Register.Index;
 416                 resource_indirect = inst->Src[0].Register.Indirect;
 417         }
 418
 419         assert(resource_file == TGSI_FILE_BUFFER ||
 420                resource_file == TGSI_FILE_IMAGE ||
 421                /* bindless image */
 422                resource_file == TGSI_FILE_INPUT ||
 423                resource_file == TGSI_FILE_OUTPUT ||
 424                resource_file == TGSI_FILE_CONSTANT ||
 425                resource_file == TGSI_FILE_TEMPORARY ||
 426                resource_file == TGSI_FILE_IMMEDIATE);
 427
 428         assert(resource_file != TGSI_FILE_BUFFER ||
 429                inst->Memory.Texture == TGSI_TEXTURE_BUFFER);
 430
 431         bool bindless = resource_file != TGSI_FILE_BUFFER &&
 432                         resource_file != TGSI_FILE_IMAGE;
 433
 434         /* RESTRICT means NOALIAS.
 435          * If there are no writes, we can assume the accessed memory is read-only.
 436          * If there are no reads, we can assume the accessed memory is write-only.
 437          */
 438         if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT && !bindless) {
 439                 unsigned reverse_access_mask;
 440
 441                 if (resource_file == TGSI_FILE_BUFFER) {
 442                         reverse_access_mask = shader_buffers_reverse_access_mask;
 443                 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 444                         reverse_access_mask = info->images_buffers &
 445                                               images_reverse_access_mask;
 446                 } else {
 447                         reverse_access_mask = ~info->images_buffers &
 448                                               images_reverse_access_mask;
 449                 }
 450
 451                 if (resource_indirect) {
 452                         if (!reverse_access_mask)
 453                                 return true;
 454                 } else {
 455                         if (!(reverse_access_mask &
 456                               (1u << resource_index)))
 457                                 return true;
 458                 }
 459         }
 460
 461         /* If there are no buffer writes (for both shader buffers & image
 462          * buffers), it implies that buffer memory is read-only.
 463          * If there are no buffer reads (for both shader buffers & image
 464          * buffers), it implies that buffer memory is write-only.
 465          *
 466          * Same for the case when there are no writes/reads for non-buffer
 467          * images.
 468          */
 469         if (resource_file == TGSI_FILE_BUFFER ||
 470             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 471                 if (!shader_buffers_reverse_access_mask &&
 472                     !(info->images_buffers & images_reverse_access_mask) &&
 473                     !bindless_buffer_reverse_access_mask)
 474                         return true;
 475         } else {
 476                 if (!(~info->images_buffers & images_reverse_access_mask) &&
 477                     !bindless_image_reverse_access_mask)
 478                         return true;
 479         }
 480         return false;
 481 }
 482
 483 static void load_emit(
 484                 const struct lp_build_tgsi_action *action,
 485                 struct lp_build_tgsi_context *bld_base,
 486                 struct lp_build_emit_data *emit_data)
 487 {
 488         struct si_shader_context *ctx = si_shader_context(bld_base);
 489         const struct tgsi_full_instruction * inst = emit_data->inst;
 490         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 491         bool can_speculate = false;
 492         LLVMValueRef vindex = ctx->i32_0;
 493         LLVMValueRef voffset = ctx->i32_0;
 494         struct ac_image_args args = {};
 495
 496         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
 497                 load_emit_memory(ctx, emit_data);
 498                 return;
 499         }
 500
 501         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
 502             inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) {
 503                 bool ubo = inst->Src[0].Register.File == TGSI_FILE_CONSTBUF;
 504                 args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], ubo);
 505                 voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0));
 506         } else {
 507                 unsigned target = inst->Memory.Texture;
 508
 509                 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &args.resource);
 510                 image_fetch_coords(bld_base, inst, 1, args.resource, args.coords);
 511                 vindex = args.coords[0]; /* for buffers only */
 512         }
 513
 514         if (inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) {
 515                 emit_data->output[emit_data->chan] =
 516                         ac_build_buffer_load(&ctx->ac, args.resource,
 517                                              util_last_bit(inst->Dst[0].Register.WriteMask),
 518                                              NULL, voffset, NULL, 0, 0, true, true);
 519                 return;
 520         }
 521
 522         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
 523                 ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE);
 524
 525         can_speculate = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
 526                           is_oneway_access_only(inst, info,
 527                                                 info->shader_buffers_store |
 528                                                 info->shader_buffers_atomic,
 529                                                 info->images_store |
 530                                                 info->images_atomic,
 531                                                 info->uses_bindless_buffer_store |
 532                                                 info->uses_bindless_buffer_atomic,
 533                                                 info->uses_bindless_image_store |
 534                                                 info->uses_bindless_image_atomic);
 535         args.cache_policy = get_cache_policy(ctx, inst, false, false, false);
 536
 537         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 538                 /* Don't use SMEM for shader buffer loads, because LLVM doesn't
 539                  * select SMEM for SI.load.const with a non-constant offset, and
 540                  * constant offsets practically don't exist with shader buffers.
 541                  *
 542                  * Also, SI.load.const doesn't use inst_offset when it's lowered
 543                  * to VMEM, so we just end up with more VALU instructions in the end
 544                  * and no benefit.
 545                  *
 546                  * TODO: Remove this line once LLVM can select SMEM with a non-constant
 547                  *       offset, and can derive inst_offset when VMEM is selected.
 548                  *       After that, si_memory_barrier should invalidate sL1 for shader
 549                  *       buffers.
 550                  */
 551                 emit_data->output[emit_data->chan] =
 552                         ac_build_buffer_load(&ctx->ac, args.resource,
 553                                              util_last_bit(inst->Dst[0].Register.WriteMask),
 554                                              NULL, voffset, NULL, 0,
 555                                              args.cache_policy, can_speculate, false);
 556                 return;
 557         }
 558
 559         if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 560                 unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask);
 561                 LLVMValueRef result =
 562                         ac_build_buffer_load_format(&ctx->ac,
 563                                                     args.resource,
 564                                                     vindex,
 565                                                     ctx->i32_0,
 566                                                     num_channels,
 567                                                     args.cache_policy,
 568                                                     can_speculate);
 569                 emit_data->output[emit_data->chan] =
 570                         ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
 571         } else {
 572                 args.opcode = ac_image_load;
 573                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 574                 args.attributes = ac_get_load_intr_attribs(can_speculate);
 575                 args.dmask = 0xf;
 576
 577                 emit_data->output[emit_data->chan] =
 578                         ac_build_image_opcode(&ctx->ac, &args);
 579         }
 580 }
 581
 582 static void store_emit_buffer(struct si_shader_context *ctx,
 583                               LLVMValueRef resource,
 584                               unsigned writemask,
 585                               LLVMValueRef value,
 586                               LLVMValueRef voffset,
 587                               unsigned cache_policy,
 588                               bool writeonly_memory)
 589 {
 590         LLVMBuilderRef builder = ctx->ac.builder;
 591         LLVMValueRef base_data = value;
 592         LLVMValueRef base_offset = voffset;
 593
 594         while (writemask) {
 595                 int start, count;
 596                 LLVMValueRef data, voff;
 597
 598                 u_bit_scan_consecutive_range(&writemask, &start, &count);
 599
 600                 if (count == 3 && ac_has_vec3_support(ctx->ac.chip_class, false)) {
 601                         LLVMValueRef values[3] = {
 602                                 LLVMBuildExtractElement(builder, base_data,
 603                                                         LLVMConstInt(ctx->i32, start, 0), ""),
 604                                 LLVMBuildExtractElement(builder, base_data,
 605                                                         LLVMConstInt(ctx->i32, start + 1, 0), ""),
 606                                 LLVMBuildExtractElement(builder, base_data,
 607                                                         LLVMConstInt(ctx->i32, start + 2, 0), ""),
 608                         };
 609                         data = ac_build_gather_values(&ctx->ac, values, 3);
 610                 } else if (count >= 3) {
 611                         data = base_data;
 612                 } else if (count == 2) {
 613                         LLVMValueRef values[2] = {
 614                                 LLVMBuildExtractElement(builder, base_data,
 615                                                         LLVMConstInt(ctx->i32, start, 0), ""),
 616                                 LLVMBuildExtractElement(builder, base_data,
 617                                                         LLVMConstInt(ctx->i32, start + 1, 0), ""),
 618                         };
 619
 620                         data = ac_build_gather_values(&ctx->ac, values, 2);
 621                 } else {
 622                         assert(count == 1);
 623                         data = LLVMBuildExtractElement(
 624                                 builder, base_data,
 625                                 LLVMConstInt(ctx->i32, start, 0), "");
 626                 }
 627
 628                 voff = base_offset;
 629                 if (start != 0) {
 630                         voff = LLVMBuildAdd(
 631                                 builder, voff,
 632                                 LLVMConstInt(ctx->i32, start * 4, 0), "");
 633                 }
 634
 635                 ac_build_buffer_store_dword(&ctx->ac, resource, data, count,
 636                                             voff, ctx->i32_0, 0, cache_policy,
 637                                             false);
 638         }
 639 }
 640
 641 static void store_emit_memory(
 642                 struct si_shader_context *ctx,
 643                 struct lp_build_emit_data *emit_data)
 644 {
 645         const struct tgsi_full_instruction *inst = emit_data->inst;
 646         LLVMBuilderRef builder = ctx->ac.builder;
 647         unsigned writemask = inst->Dst[0].Register.WriteMask;
 648         LLVMValueRef ptr, derived_ptr, data, index;
 649         int chan;
 650
 651         ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
 652
 653         for (chan = 0; chan < 4; ++chan) {
 654                 if (!(writemask & (1 << chan))) {
 655                         continue;
 656                 }
 657                 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
 658                 index = LLVMConstInt(ctx->i32, chan, 0);
 659                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
 660                 LLVMBuildStore(builder, data, derived_ptr);
 661         }
 662 }
 663
 664 static void store_emit(
 665                 const struct lp_build_tgsi_action *action,
 666                 struct lp_build_tgsi_context *bld_base,
 667                 struct lp_build_emit_data *emit_data)
 668 {
 669         struct si_shader_context *ctx = si_shader_context(bld_base);
 670         const struct tgsi_full_instruction * inst = emit_data->inst;
 671         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 672         struct tgsi_full_src_register resource_reg =
 673                 tgsi_full_src_register_from_dst(&inst->Dst[0]);
 674         unsigned target = inst->Memory.Texture;
 675
 676         if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
 677                 store_emit_memory(ctx, emit_data);
 678                 return;
 679         }
 680
 681         bool writeonly_memory = is_oneway_access_only(inst, info,
 682                                                       info->shader_buffers_load |
 683                                                       info->shader_buffers_atomic,
 684                                                       info->images_load |
 685                                                       info->images_atomic,
 686                                                       info->uses_bindless_buffer_load |
 687                                                       info->uses_bindless_buffer_atomic,
 688                                                       info->uses_bindless_image_load |
 689                                                       info->uses_bindless_image_atomic);
 690         LLVMValueRef chans[4];
 691         LLVMValueRef vindex = ctx->i32_0;
 692         LLVMValueRef voffset = ctx->i32_0;
 693         struct ac_image_args args = {};
 694
 695         for (unsigned chan = 0; chan < 4; ++chan)
 696                 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
 697
 698         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
 699                 args.resource = shader_buffer_fetch_rsrc(ctx, &resource_reg, false);
 700                 voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 0, 0));
 701         } else {
 702                 image_fetch_rsrc(bld_base, &resource_reg, true, target, &args.resource);
 703                 image_fetch_coords(bld_base, inst, 0, args.resource, args.coords);
 704                 vindex = args.coords[0]; /* for buffers only */
 705         }
 706
 707         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
 708                 ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE);
 709
 710         bool is_image = inst->Dst[0].Register.File != TGSI_FILE_BUFFER;
 711         args.cache_policy = get_cache_policy(ctx, inst,
 712                                              false, /* atomic */
 713                                              is_image, /* may_store_unaligned */
 714                                              writeonly_memory);
 715
 716         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
 717                 store_emit_buffer(ctx, args.resource, inst->Dst[0].Register.WriteMask,
 718                                   ac_build_gather_values(&ctx->ac, chans, 4),
 719                                   voffset, args.cache_policy, writeonly_memory);
 720                 return;
 721         }
 722
 723         if (target == TGSI_TEXTURE_BUFFER) {
 724                 unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask);
 725
 726                 ac_build_buffer_store_format(&ctx->ac, args.resource,
 727                                              ac_build_gather_values(&ctx->ac, chans, num_channels),
 728                                              vindex, ctx->i32_0 /* voffset */,
 729                                              num_channels,
 730                                              args.cache_policy);
 731         } else {
 732                 args.opcode = ac_image_store;
 733                 args.data[0] = ac_build_gather_values(&ctx->ac, chans, 4);
 734                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 735                 args.attributes = AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY;
 736                 args.dmask = 0xf;
 737
 738                 emit_data->output[emit_data->chan] =
 739                         ac_build_image_opcode(&ctx->ac, &args);
 740         }
 741 }
 742
 743 static void atomic_emit_memory(struct si_shader_context *ctx,
 744                                struct lp_build_emit_data *emit_data) {
 745         LLVMBuilderRef builder = ctx->ac.builder;
 746         const struct tgsi_full_instruction * inst = emit_data->inst;
 747         LLVMValueRef ptr, result, arg;
 748         const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup";
 749
 750         ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
 751
 752         arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
 753         arg = ac_to_integer(&ctx->ac, arg);
 754
 755         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 756                 LLVMValueRef new_data;
 757                 new_data = lp_build_emit_fetch(&ctx->bld_base,
 758                                                inst, 3, 0);
 759
 760                 new_data = ac_to_integer(&ctx->ac, new_data);
 761
 762                 result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, arg, new_data,
 763                                                   sync_scope);
 764                 result = LLVMBuildExtractValue(builder, result, 0, "");
 765         } else {
 766                 LLVMAtomicRMWBinOp op;
 767
 768                 switch(inst->Instruction.Opcode) {
 769                         case TGSI_OPCODE_ATOMUADD:
 770                                 op = LLVMAtomicRMWBinOpAdd;
 771                                 break;
 772                         case TGSI_OPCODE_ATOMXCHG:
 773                                 op = LLVMAtomicRMWBinOpXchg;
 774                                 break;
 775                         case TGSI_OPCODE_ATOMAND:
 776                                 op = LLVMAtomicRMWBinOpAnd;
 777                                 break;
 778                         case TGSI_OPCODE_ATOMOR:
 779                                 op = LLVMAtomicRMWBinOpOr;
 780                                 break;
 781                         case TGSI_OPCODE_ATOMXOR:
 782                                 op = LLVMAtomicRMWBinOpXor;
 783                                 break;
 784                         case TGSI_OPCODE_ATOMUMIN:
 785                                 op = LLVMAtomicRMWBinOpUMin;
 786                                 break;
 787                         case TGSI_OPCODE_ATOMUMAX:
 788                                 op = LLVMAtomicRMWBinOpUMax;
 789                                 break;
 790                         case TGSI_OPCODE_ATOMIMIN:
 791                                 op = LLVMAtomicRMWBinOpMin;
 792                                 break;
 793                         case TGSI_OPCODE_ATOMIMAX:
 794                                 op = LLVMAtomicRMWBinOpMax;
 795                                 break;
 796                         default:
 797                                 unreachable("unknown atomic opcode");
 798                 }
 799
 800                 result = ac_build_atomic_rmw(&ctx->ac, op, ptr, arg, sync_scope);
 801         }
 802         emit_data->output[emit_data->chan] =
 803                 LLVMBuildBitCast(builder, result, ctx->f32, "");
 804 }
 805
 806 static void atomic_emit(
 807                 const struct lp_build_tgsi_action *action,
 808                 struct lp_build_tgsi_context *bld_base,
 809                 struct lp_build_emit_data *emit_data)
 810 {
 811         struct si_shader_context *ctx = si_shader_context(bld_base);
 812         const struct tgsi_full_instruction * inst = emit_data->inst;
 813         struct ac_image_args args = {};
 814         unsigned num_data = 0;
 815         LLVMValueRef vindex = ctx->i32_0;
 816         LLVMValueRef voffset = ctx->i32_0;
 817
 818         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
 819                 atomic_emit_memory(ctx, emit_data);
 820                 return;
 821         }
 822
 823         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 824                 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
 825                  * of arguments, which is reversed relative to TGSI (and GLSL)
 826                  */
 827                 args.data[num_data++] =
 828                         ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 3, 0));
 829         }
 830
 831         args.data[num_data++] =
 832                 ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 2, 0));
 833
 834         args.cache_policy = get_cache_policy(ctx, inst, true, false, false);
 835
 836         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 837                 args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], false);
 838                 voffset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0));
 839         } else {
 840                 image_fetch_rsrc(bld_base, &inst->Src[0], true,
 841                                 inst->Memory.Texture, &args.resource);
 842                 image_fetch_coords(bld_base, inst, 1, args.resource, args.coords);
 843                 vindex = args.coords[0]; /* for buffers only */
 844         }
 845
 846         if (inst->Src[0].Register.File != TGSI_FILE_BUFFER &&
 847             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 848                 LLVMValueRef buf_args[7];
 849                 unsigned num_args = 0;
 850
 851                 buf_args[num_args++] = args.data[0];
 852                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
 853                         buf_args[num_args++] = args.data[1];
 854
 855                 buf_args[num_args++] = args.resource;
 856                 buf_args[num_args++] = vindex;
 857                 buf_args[num_args++] = voffset;
 858                 buf_args[num_args++] = ctx->i32_0; /* soffset */
 859                 buf_args[num_args++] = LLVMConstInt(ctx->i32, args.cache_policy & ac_slc, 0);
 860
 861                 char intrinsic_name[64];
 862                 snprintf(intrinsic_name, sizeof(intrinsic_name),
 863                          "llvm.amdgcn.struct.buffer.atomic.%s", action->intr_name);
 864                 emit_data->output[emit_data->chan] =
 865                         ac_to_float(&ctx->ac,
 866                                     ac_build_intrinsic(&ctx->ac, intrinsic_name,
 867                                                        ctx->i32, buf_args, num_args, 0));
 868                 return;
 869         }
 870
 871         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 872                 LLVMValueRef buf_args[7];
 873                 unsigned num_args = 0;
 874
 875                 buf_args[num_args++] = args.data[0];
 876                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
 877                         buf_args[num_args++] = args.data[1];
 878
 879                 buf_args[num_args++] = args.resource;
 880                 buf_args[num_args++] = vindex;
 881                 buf_args[num_args++] = voffset;
 882                 buf_args[num_args++] = args.cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
 883
 884                 char intrinsic_name[40];
 885                 snprintf(intrinsic_name, sizeof(intrinsic_name),
 886                          "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
 887                 emit_data->output[emit_data->chan] =
 888                         ac_to_float(&ctx->ac,
 889                                     ac_build_intrinsic(&ctx->ac, intrinsic_name,
 890                                                        ctx->i32, buf_args, num_args, 0));
 891         } else {
 892                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 893                         args.opcode = ac_image_atomic_cmpswap;
 894                 } else {
 895                         args.opcode = ac_image_atomic;
 896                         switch (inst->Instruction.Opcode) {
 897                         case TGSI_OPCODE_ATOMXCHG: args.atomic = ac_atomic_swap; break;
 898                         case TGSI_OPCODE_ATOMUADD: args.atomic = ac_atomic_add; break;
 899                         case TGSI_OPCODE_ATOMAND: args.atomic = ac_atomic_and; break;
 900                         case TGSI_OPCODE_ATOMOR: args.atomic = ac_atomic_or; break;
 901                         case TGSI_OPCODE_ATOMXOR: args.atomic = ac_atomic_xor; break;
 902                         case TGSI_OPCODE_ATOMUMIN: args.atomic = ac_atomic_umin; break;
 903                         case TGSI_OPCODE_ATOMUMAX: args.atomic = ac_atomic_umax; break;
 904                         case TGSI_OPCODE_ATOMIMIN: args.atomic = ac_atomic_smin; break;
 905                         case TGSI_OPCODE_ATOMIMAX: args.atomic = ac_atomic_smax; break;
 906                         case TGSI_OPCODE_ATOMINC_WRAP:
 907                                 args.atomic = ac_atomic_inc_wrap;
 908                                 break;
 909                         case TGSI_OPCODE_ATOMDEC_WRAP:
 910                                 args.atomic = ac_atomic_dec_wrap;
 911                                 break;
 912                         default: unreachable("unhandled image atomic");
 913                         }
 914                 }
 915
 916                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 917                 emit_data->output[emit_data->chan] =
 918                         ac_to_float(&ctx->ac, ac_build_image_opcode(&ctx->ac, &args));
 919         }
 920 }
 921
 922 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
 923                                 unsigned target, LLVMValueRef out)
 924 {
 925         LLVMBuilderRef builder = ctx->ac.builder;
 926
 927         /* 1D textures are allocated and used as 2D on GFX9. */
 928         if (ctx->screen->info.chip_class == GFX9 &&
 929             (target == TGSI_TEXTURE_1D_ARRAY ||
 930              target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
 931                 LLVMValueRef layers =
 932                         LLVMBuildExtractElement(builder, out,
 933                                                 LLVMConstInt(ctx->i32, 2, 0), "");
 934                 out = LLVMBuildInsertElement(builder, out, layers,
 935                                              ctx->i32_1, "");
 936         }
 937
 938         /* Divide the number of layers by 6 to get the number of cubes. */
 939         if (target == TGSI_TEXTURE_CUBE_ARRAY ||
 940             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
 941                 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
 942
 943                 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
 944                 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
 945
 946                 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
 947         }
 948         return out;
 949 }
 950
 951 static void resq_emit(
 952                 const struct lp_build_tgsi_action *action,
 953                 struct lp_build_tgsi_context *bld_base,
 954                 struct lp_build_emit_data *emit_data)
 955 {
 956         struct si_shader_context *ctx = si_shader_context(bld_base);
 957         LLVMBuilderRef builder = ctx->ac.builder;
 958         const struct tgsi_full_instruction *inst = emit_data->inst;
 959         const struct tgsi_full_src_register *reg =
 960                 &inst->Src[inst->Instruction.Opcode == TGSI_OPCODE_TXQ ? 1 : 0];
 961
 962         if (reg->Register.File == TGSI_FILE_BUFFER) {
 963                 LLVMValueRef rsrc = shader_buffer_fetch_rsrc(ctx, reg, false);
 964
 965                 emit_data->output[emit_data->chan] =
 966                         LLVMBuildExtractElement(builder, rsrc,
 967                                                 LLVMConstInt(ctx->i32, 2, 0), "");
 968                 return;
 969         }
 970
 971         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
 972             inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
 973                 LLVMValueRef rsrc;
 974
 975                 tex_fetch_ptrs(bld_base, emit_data, &rsrc, NULL, NULL);
 976                 /* Read the size from the buffer descriptor directly. */
 977                 emit_data->output[emit_data->chan] =
 978                         get_buffer_size(bld_base, rsrc);
 979                 return;
 980         }
 981
 982         if (inst->Instruction.Opcode == TGSI_OPCODE_RESQ &&
 983             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 984                 LLVMValueRef rsrc;
 985
 986                 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, &rsrc);
 987                 emit_data->output[emit_data->chan] =
 988                         get_buffer_size(bld_base, rsrc);
 989                 return;
 990         }
 991
 992         unsigned target;
 993
 994         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
 995                 target = inst->Texture.Texture;
 996         } else {
 997                 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
 998                         target = TGSI_TEXTURE_2D_ARRAY;
 999                 else
1000                         target = inst->Memory.Texture;
1001         }
1002
1003         struct ac_image_args args = {};
1004         args.opcode = ac_image_get_resinfo;
1005         args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1006         args.dmask = 0xf;
1007         args.attributes = AC_FUNC_ATTR_READNONE;
1008
1009         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
1010                 tex_fetch_ptrs(bld_base, emit_data, &args.resource, NULL, NULL);
1011                 args.lod = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
1012         } else {
1013                 image_fetch_rsrc(bld_base, reg, false, target, &args.resource);
1014                 args.lod = ctx->i32_0;
1015         }
1016
1017         emit_data->output[emit_data->chan] =
1018                 fix_resinfo(ctx, target, ac_build_image_opcode(&ctx->ac, &args));
1019
1020         if (inst->Instruction.Opcode == TGSI_OPCODE_RESQ &&
1021             (target == TGSI_TEXTURE_2D_MSAA ||
1022              target == TGSI_TEXTURE_2D_ARRAY_MSAA)) {
1023                 LLVMValueRef samples =
1024                         ac_build_image_get_sample_count(&ctx->ac, args.resource);
1025
1026                 emit_data->output[emit_data->chan] =
1027                         LLVMBuildInsertElement(ctx->ac.builder,
1028                                                emit_data->output[emit_data->chan],
1029                                                samples,
1030                                                LLVMConstInt(ctx->i32, 3, 0), "");
1031         }
1032 }
1033
1034 /**
1035  * Load an image view, fmask view. or sampler state descriptor.
1036  */
1037 LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx,
1038                                   LLVMValueRef list, LLVMValueRef index,
1039                                   enum ac_descriptor_type type)
1040 {
1041         LLVMBuilderRef builder = ctx->ac.builder;
1042
1043         switch (type) {
1044         case AC_DESC_IMAGE:
1045                 /* The image is at [0:7]. */
1046                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
1047                 break;
1048         case AC_DESC_BUFFER:
1049                 /* The buffer is in [4:7]. */
1050                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 4, 0),
1051                                       ctx->i32_1);
1052                 list = LLVMBuildPointerCast(builder, list,
1053                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
1054                 break;
1055         case AC_DESC_FMASK:
1056                 /* The FMASK is at [8:15]. */
1057                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0),
1058                                       ctx->i32_1);
1059                 break;
1060         case AC_DESC_SAMPLER:
1061                 /* The sampler state is at [12:15]. */
1062                 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 4, 0),
1063                                       LLVMConstInt(ctx->i32, 3, 0));
1064                 list = LLVMBuildPointerCast(builder, list,
1065                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
1066                 break;
1067         case AC_DESC_PLANE_0:
1068         case AC_DESC_PLANE_1:
1069         case AC_DESC_PLANE_2:
1070                 /* Only used for the multiplane image support for Vulkan. Should
1071                  * never be reached in radeonsi.
1072                  */
1073                 unreachable("Plane descriptor requested in radeonsi.");
1074         }
1075
1076         return ac_build_load_to_sgpr(&ctx->ac, list, index);
1077 }
1078
1079 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
1080  *
1081  * GFX6-GFX7:
1082  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
1083  *   filtering manually. The driver sets img7 to a mask clearing
1084  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
1085  *     s_and_b32 samp0, samp0, img7
1086  *
1087  * GFX8:
1088  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
1089  */
1090 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
1091                                            LLVMValueRef res, LLVMValueRef samp)
1092 {
1093         LLVMValueRef img7, samp0;
1094
1095         if (ctx->screen->info.chip_class >= GFX8)
1096                 return samp;
1097
1098         img7 = LLVMBuildExtractElement(ctx->ac.builder, res,
1099                                        LLVMConstInt(ctx->i32, 7, 0), "");
1100         samp0 = LLVMBuildExtractElement(ctx->ac.builder, samp,
1101                                         ctx->i32_0, "");
1102         samp0 = LLVMBuildAnd(ctx->ac.builder, samp0, img7, "");
1103         return LLVMBuildInsertElement(ctx->ac.builder, samp, samp0,
1104                                       ctx->i32_0, "");
1105 }
1106
1107 static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
1108                            struct lp_build_emit_data *emit_data,
1109                            LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
1110                            LLVMValueRef *fmask_ptr)
1111 {
1112         struct si_shader_context *ctx = si_shader_context(bld_base);
1113         LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images);
1114         const struct tgsi_full_instruction *inst = emit_data->inst;
1115         const struct tgsi_full_src_register *reg;
1116         unsigned target = inst->Texture.Texture;
1117         unsigned sampler_src;
1118         LLVMValueRef index;
1119
1120         sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
1121         reg = &emit_data->inst->Src[sampler_src];
1122
1123         if (reg->Register.Indirect) {
1124                 index = si_get_bounded_indirect_index(ctx,
1125                                                       &reg->Indirect,
1126                                                       reg->Register.Index,
1127                                                       ctx->num_samplers);
1128                 index = LLVMBuildAdd(ctx->ac.builder, index,
1129                                      LLVMConstInt(ctx->i32, SI_NUM_IMAGE_SLOTS / 2, 0), "");
1130         } else {
1131                 index = LLVMConstInt(ctx->i32,
1132                                      si_get_sampler_slot(reg->Register.Index), 0);
1133         }
1134
1135         if (reg->Register.File != TGSI_FILE_SAMPLER) {
1136                 /* Bindless descriptors are accessible from a different pair of
1137                  * user SGPR indices.
1138                  */
1139                 list = LLVMGetParam(ctx->main_fn,
1140                                     ctx->param_bindless_samplers_and_images);
1141                 index = lp_build_emit_fetch_src(bld_base, reg,
1142                                                 TGSI_TYPE_UNSIGNED, 0);
1143
1144                 /* Since bindless handle arithmetic can contain an unsigned integer
1145                  * wraparound and si_load_sampler_desc assumes there isn't any,
1146                  * use GEP without "inbounds" (inside ac_build_pointer_add)
1147                  * to prevent incorrect code generation and hangs.
1148                  */
1149                 index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
1150                 list = ac_build_pointer_add(&ctx->ac, list, index);
1151                 index = ctx->i32_0;
1152         }
1153
1154         if (target == TGSI_TEXTURE_BUFFER)
1155                 *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_BUFFER);
1156         else
1157                 *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_IMAGE);
1158
1159         if (samp_ptr)
1160                 *samp_ptr = NULL;
1161         if (fmask_ptr)
1162                 *fmask_ptr = NULL;
1163
1164         if (target == TGSI_TEXTURE_2D_MSAA ||
1165             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
1166                 if (fmask_ptr)
1167                         *fmask_ptr = si_load_sampler_desc(ctx, list, index,
1168                                                           AC_DESC_FMASK);
1169         } else if (target != TGSI_TEXTURE_BUFFER) {
1170                 if (samp_ptr) {
1171                         *samp_ptr = si_load_sampler_desc(ctx, list, index,
1172                                                          AC_DESC_SAMPLER);
1173                         *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
1174                 }
1175         }
1176 }
1177
1178 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
1179  * incorrectly forces nearest filtering if the texture format is integer.
1180  * The only effect it has on Gather4, which always returns 4 texels for
1181  * bilinear filtering, is that the final coordinates are off by 0.5 of
1182  * the texel size.
1183  *
1184  * The workaround is to subtract 0.5 from the unnormalized coordinates,
1185  * or (0.5 / size) from the normalized coordinates.
1186  *
1187  * However, cube textures with 8_8_8_8 data formats require a different
1188  * workaround of overriding the num format to USCALED/SSCALED. This would lose
1189  * precision in 32-bit data formats, so it needs to be applied dynamically at
1190  * runtime. In this case, return an i1 value that indicates whether the
1191  * descriptor was overridden (and hence a fixup of the sampler result is needed).
1192  */
1193 static LLVMValueRef
1194 si_lower_gather4_integer(struct si_shader_context *ctx,
1195                          struct ac_image_args *args,
1196                          unsigned target,
1197                          enum tgsi_return_type return_type)
1198 {
1199         LLVMBuilderRef builder = ctx->ac.builder;
1200         LLVMValueRef wa_8888 = NULL;
1201         LLVMValueRef half_texel[2];
1202
1203         assert(return_type == TGSI_RETURN_TYPE_SINT ||
1204                return_type == TGSI_RETURN_TYPE_UINT);
1205
1206         if (target == TGSI_TEXTURE_CUBE ||
1207             target == TGSI_TEXTURE_CUBE_ARRAY) {
1208                 LLVMValueRef formats;
1209                 LLVMValueRef data_format;
1210                 LLVMValueRef wa_formats;
1211
1212                 formats = LLVMBuildExtractElement(builder, args->resource, ctx->i32_1, "");
1213
1214                 data_format = LLVMBuildLShr(builder, formats,
1215                                             LLVMConstInt(ctx->i32, 20, false), "");
1216                 data_format = LLVMBuildAnd(builder, data_format,
1217                                            LLVMConstInt(ctx->i32, (1u << 6) - 1, false), "");
1218                 wa_8888 = LLVMBuildICmp(
1219                         builder, LLVMIntEQ, data_format,
1220                         LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false),
1221                         "");
1222
1223                 uint32_t wa_num_format =
1224                         return_type == TGSI_RETURN_TYPE_UINT ?
1225                         S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_USCALED) :
1226                         S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SSCALED);
1227                 wa_formats = LLVMBuildAnd(builder, formats,
1228                                           LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false),
1229                                           "");
1230                 wa_formats = LLVMBuildOr(builder, wa_formats,
1231                                         LLVMConstInt(ctx->i32, wa_num_format, false), "");
1232
1233                 formats = LLVMBuildSelect(builder, wa_8888, wa_formats, formats, "");
1234                 args->resource = LLVMBuildInsertElement(
1235                         builder, args->resource, formats, ctx->i32_1, "");
1236         }
1237
1238         if (target == TGSI_TEXTURE_RECT ||
1239             target == TGSI_TEXTURE_SHADOWRECT) {
1240                 assert(!wa_8888);
1241                 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
1242         } else {
1243                 struct ac_image_args resinfo = {};
1244                 struct lp_build_if_state if_ctx;
1245
1246                 if (wa_8888) {
1247                         /* Skip the texture size query entirely if we don't need it. */
1248                         lp_build_if(&if_ctx, &ctx->gallivm, LLVMBuildNot(builder, wa_8888, ""));
1249                 }
1250
1251                 /* Query the texture size. */
1252                 resinfo.opcode = ac_image_get_resinfo;
1253                 resinfo.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1254                 resinfo.resource = args->resource;
1255                 resinfo.sampler = args->sampler;
1256                 resinfo.lod = ctx->ac.i32_0;
1257                 resinfo.dmask = 0xf;
1258                 resinfo.attributes = AC_FUNC_ATTR_READNONE;
1259
1260                 LLVMValueRef texsize =
1261                         fix_resinfo(ctx, target,
1262                                     ac_build_image_opcode(&ctx->ac, &resinfo));
1263
1264                 /* Compute -0.5 / size. */
1265                 for (unsigned c = 0; c < 2; c++) {
1266                         half_texel[c] =
1267                                 LLVMBuildExtractElement(builder, texsize,
1268                                                         LLVMConstInt(ctx->i32, c, 0), "");
1269                         half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
1270                         half_texel[c] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, half_texel[c]);
1271                         half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
1272                                                       LLVMConstReal(ctx->f32, -0.5), "");
1273                 }
1274
1275                 if (wa_8888) {
1276                         lp_build_endif(&if_ctx);
1277
1278                         LLVMBasicBlockRef bb[2] = { if_ctx.true_block, if_ctx.entry_block };
1279
1280                         for (unsigned c = 0; c < 2; c++) {
1281                                 LLVMValueRef values[2] = { half_texel[c], ctx->ac.f32_0 };
1282                                 half_texel[c] = ac_build_phi(&ctx->ac, ctx->f32, 2,
1283                                                              values, bb);
1284                         }
1285                 }
1286         }
1287
1288         for (unsigned c = 0; c < 2; c++) {
1289                 LLVMValueRef tmp;
1290                 tmp = ac_to_float(&ctx->ac, args->coords[c]);
1291                 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
1292                 args->coords[c] = ac_to_integer(&ctx->ac, tmp);
1293         }
1294
1295         return wa_8888;
1296 }
1297
1298 /* The second half of the cube texture 8_8_8_8 integer workaround: adjust the
1299  * result after the gather operation.
1300  */
1301 static LLVMValueRef
1302 si_fix_gather4_integer_result(struct si_shader_context *ctx,
1303                            LLVMValueRef result,
1304                            enum tgsi_return_type return_type,
1305                            LLVMValueRef wa)
1306 {
1307         LLVMBuilderRef builder = ctx->ac.builder;
1308
1309         assert(return_type == TGSI_RETURN_TYPE_SINT ||
1310                return_type == TGSI_RETURN_TYPE_UINT);
1311
1312         for (unsigned chan = 0; chan < 4; ++chan) {
1313                 LLVMValueRef chanv = LLVMConstInt(ctx->i32, chan, false);
1314                 LLVMValueRef value;
1315                 LLVMValueRef wa_value;
1316
1317                 value = LLVMBuildExtractElement(builder, result, chanv, "");
1318
1319                 if (return_type == TGSI_RETURN_TYPE_UINT)
1320                         wa_value = LLVMBuildFPToUI(builder, value, ctx->i32, "");
1321                 else
1322                         wa_value = LLVMBuildFPToSI(builder, value, ctx->i32, "");
1323                 wa_value = ac_to_float(&ctx->ac, wa_value);
1324                 value = LLVMBuildSelect(builder, wa, wa_value, value, "");
1325
1326                 result = LLVMBuildInsertElement(builder, result, value, chanv, "");
1327         }
1328
1329         return result;
1330 }
1331
1332 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
1333                                 struct lp_build_tgsi_context *bld_base,
1334                                 struct lp_build_emit_data *emit_data)
1335 {
1336         struct si_shader_context *ctx = si_shader_context(bld_base);
1337         const struct tgsi_full_instruction *inst = emit_data->inst;
1338         unsigned opcode = inst->Instruction.Opcode;
1339         unsigned target = inst->Texture.Texture;
1340         struct ac_image_args args = {};
1341         int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
1342         unsigned chan;
1343         bool has_offset = inst->Texture.NumOffsets > 0;
1344         LLVMValueRef fmask_ptr = NULL;
1345
1346         tex_fetch_ptrs(bld_base, emit_data, &args.resource, &args.sampler, &fmask_ptr);
1347
1348         if (target == TGSI_TEXTURE_BUFFER) {
1349                 LLVMValueRef vindex = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
1350                 unsigned num_channels =
1351                         util_last_bit(inst->Dst[0].Register.WriteMask);
1352                 LLVMValueRef result =
1353                         ac_build_buffer_load_format(&ctx->ac,
1354                                                     args.resource,
1355                                                     vindex,
1356                                                     ctx->i32_0,
1357                                                     num_channels, 0, true);
1358                 emit_data->output[emit_data->chan] =
1359                         ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
1360                 return;
1361         }
1362
1363         /* Fetch and project texture coordinates */
1364         args.coords[3] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_W);
1365         for (chan = 0; chan < 3; chan++) {
1366                 args.coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
1367                 if (opcode == TGSI_OPCODE_TXP)
1368                         args.coords[chan] = ac_build_fdiv(&ctx->ac,
1369                                 args.coords[chan], args.coords[3]);
1370         }
1371
1372         if (opcode == TGSI_OPCODE_TXP)
1373                 args.coords[3] = ctx->ac.f32_1;
1374
1375         /* Pack offsets. */
1376         if (has_offset &&
1377             opcode != TGSI_OPCODE_TXF &&
1378             opcode != TGSI_OPCODE_TXF_LZ) {
1379                 /* The offsets are six-bit signed integers packed like this:
1380                  *   X=[5:0], Y=[13:8], and Z=[21:16].
1381                  */
1382                 LLVMValueRef offset[3], pack;
1383
1384                 assert(inst->Texture.NumOffsets == 1);
1385
1386                 for (chan = 0; chan < 3; chan++) {
1387                         offset[chan] = lp_build_emit_fetch_texoffset(bld_base, inst, 0, chan);
1388                         offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan],
1389                                                     LLVMConstInt(ctx->i32, 0x3f, 0), "");
1390                         if (chan)
1391                                 offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan],
1392                                                             LLVMConstInt(ctx->i32, chan*8, 0), "");
1393                 }
1394
1395                 pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], "");
1396                 pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], "");
1397                 args.offset = pack;
1398         }
1399
1400         /* Pack LOD bias value */
1401         if (opcode == TGSI_OPCODE_TXB)
1402                 args.bias = args.coords[3];
1403         if (opcode == TGSI_OPCODE_TXB2)
1404                 args.bias = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1405
1406         /* Pack depth comparison value */
1407         if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
1408                 LLVMValueRef z;
1409
1410                 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1411                         z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1412                 } else {
1413                         assert(ref_pos >= 0);
1414                         z = args.coords[ref_pos];
1415                 }
1416
1417                 /* Section 8.23.1 (Depth Texture Comparison Mode) of the
1418                  * OpenGL 4.5 spec says:
1419                  *
1420                  *    "If the texture’s internal format indicates a fixed-point
1421                  *     depth texture, then D_t and D_ref are clamped to the
1422                  *     range [0, 1]; otherwise no clamping is performed."
1423                  *
1424                  * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
1425                  * so the depth comparison value isn't clamped for Z16 and
1426                  * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
1427                  * an explicitly clamped 32-bit float format.
1428                  */
1429                 if (ctx->screen->info.chip_class >= GFX8 &&
1430                     ctx->screen->info.chip_class <= GFX9) {
1431                         LLVMValueRef upgraded;
1432                         LLVMValueRef clamped;
1433                         upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler,
1434                                                            LLVMConstInt(ctx->i32, 3, false), "");
1435                         upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded,
1436                                                  LLVMConstInt(ctx->i32, 29, false), "");
1437                         upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->i1, "");
1438                         clamped = ac_build_clamp(&ctx->ac, z);
1439                         z = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, z, "");
1440                 }
1441
1442                 args.compare = z;
1443         }
1444
1445         /* Pack user derivatives */
1446         if (opcode == TGSI_OPCODE_TXD) {
1447                 int param, num_src_deriv_channels, num_dst_deriv_channels;
1448
1449                 switch (target) {
1450                 case TGSI_TEXTURE_3D:
1451                         num_src_deriv_channels = 3;
1452                         num_dst_deriv_channels = 3;
1453                         break;
1454                 case TGSI_TEXTURE_2D:
1455                 case TGSI_TEXTURE_SHADOW2D:
1456                 case TGSI_TEXTURE_RECT:
1457                 case TGSI_TEXTURE_SHADOWRECT:
1458                 case TGSI_TEXTURE_2D_ARRAY:
1459                 case TGSI_TEXTURE_SHADOW2D_ARRAY:
1460                         num_src_deriv_channels = 2;
1461                         num_dst_deriv_channels = 2;
1462                         break;
1463                 case TGSI_TEXTURE_CUBE:
1464                 case TGSI_TEXTURE_SHADOWCUBE:
1465                 case TGSI_TEXTURE_CUBE_ARRAY:
1466                 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
1467                         /* Cube derivatives will be converted to 2D. */
1468                         num_src_deriv_channels = 3;
1469                         num_dst_deriv_channels = 3;
1470                         break;
1471                 case TGSI_TEXTURE_1D:
1472                 case TGSI_TEXTURE_SHADOW1D:
1473                 case TGSI_TEXTURE_1D_ARRAY:
1474                 case TGSI_TEXTURE_SHADOW1D_ARRAY:
1475                         num_src_deriv_channels = 1;
1476
1477                         /* 1D textures are allocated and used as 2D on GFX9. */
1478                         if (ctx->screen->info.chip_class == GFX9) {
1479                                 num_dst_deriv_channels = 2;
1480                         } else {
1481                                 num_dst_deriv_channels = 1;
1482                         }
1483                         break;
1484                 default:
1485                         unreachable("invalid target");
1486                 }
1487
1488                 for (param = 0; param < 2; param++) {
1489                         for (chan = 0; chan < num_src_deriv_channels; chan++)
1490                                 args.derivs[param * num_dst_deriv_channels + chan] =
1491                                         lp_build_emit_fetch(bld_base, inst, param+1, chan);
1492
1493                         /* Fill in the rest with zeros. */
1494                         for (chan = num_src_deriv_channels;
1495                              chan < num_dst_deriv_channels; chan++)
1496                                 args.derivs[param * num_dst_deriv_channels + chan] =
1497                                         ctx->ac.f32_0;
1498                 }
1499         }
1500
1501         if (target == TGSI_TEXTURE_CUBE ||
1502             target == TGSI_TEXTURE_CUBE_ARRAY ||
1503             target == TGSI_TEXTURE_SHADOWCUBE ||
1504             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1505                 ac_prepare_cube_coords(&ctx->ac,
1506                                        opcode == TGSI_OPCODE_TXD,
1507                                        target == TGSI_TEXTURE_CUBE_ARRAY ||
1508                                        target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
1509                                        opcode == TGSI_OPCODE_LODQ,
1510                                        args.coords, args.derivs);
1511         } else if (tgsi_is_array_sampler(target) &&
1512                    opcode != TGSI_OPCODE_TXF &&
1513                    opcode != TGSI_OPCODE_TXF_LZ &&
1514                    ctx->screen->info.chip_class <= GFX8) {
1515                 unsigned array_coord = target == TGSI_TEXTURE_1D_ARRAY ? 1 : 2;
1516                 args.coords[array_coord] = ac_build_round(&ctx->ac, args.coords[array_coord]);
1517         }
1518
1519         /* 1D textures are allocated and used as 2D on GFX9. */
1520         if (ctx->screen->info.chip_class == GFX9) {
1521                 LLVMValueRef filler;
1522
1523                 /* Use 0.5, so that we don't sample the border color. */
1524                 if (opcode == TGSI_OPCODE_TXF ||
1525                     opcode == TGSI_OPCODE_TXF_LZ)
1526                         filler = ctx->i32_0;
1527                 else
1528                         filler = LLVMConstReal(ctx->f32, 0.5);
1529
1530                 if (target == TGSI_TEXTURE_1D ||
1531                     target == TGSI_TEXTURE_SHADOW1D) {
1532                         args.coords[1] = filler;
1533                 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
1534                            target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
1535                         args.coords[2] = args.coords[1];
1536                         args.coords[1] = filler;
1537                 }
1538         }
1539
1540         /* Pack LOD or sample index */
1541         if (opcode == TGSI_OPCODE_TXL)
1542                 args.lod = args.coords[3];
1543         else if (opcode == TGSI_OPCODE_TXL2)
1544                 args.lod = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1545         else if (opcode == TGSI_OPCODE_TXF) {
1546                 if (target == TGSI_TEXTURE_2D_MSAA) {
1547                         /* No LOD, but move sample index into the right place. */
1548                         args.coords[2] = args.coords[3];
1549                 } else if (target != TGSI_TEXTURE_2D_ARRAY_MSAA) {
1550                         args.lod = args.coords[3];
1551                 }
1552         }
1553
1554         if ((target == TGSI_TEXTURE_2D_MSAA ||
1555              target == TGSI_TEXTURE_2D_ARRAY_MSAA) &&
1556             !(ctx->screen->debug_flags & DBG(NO_FMASK))) {
1557                 ac_apply_fmask_to_sample(&ctx->ac, fmask_ptr, args.coords,
1558                                          target == TGSI_TEXTURE_2D_ARRAY_MSAA);
1559         }
1560
1561         if (opcode == TGSI_OPCODE_TXF ||
1562             opcode == TGSI_OPCODE_TXF_LZ) {
1563                 /* add tex offsets */
1564                 if (inst->Texture.NumOffsets) {
1565                         const struct tgsi_texture_offset *off = inst->TexOffsets;
1566
1567                         assert(inst->Texture.NumOffsets == 1);
1568
1569                         switch (target) {
1570                         case TGSI_TEXTURE_3D:
1571                                 args.coords[2] =
1572                                         LLVMBuildAdd(ctx->ac.builder, args.coords[2],
1573                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ], "");
1574                                 /* fall through */
1575                         case TGSI_TEXTURE_2D:
1576                         case TGSI_TEXTURE_SHADOW2D:
1577                         case TGSI_TEXTURE_RECT:
1578                         case TGSI_TEXTURE_SHADOWRECT:
1579                         case TGSI_TEXTURE_2D_ARRAY:
1580                         case TGSI_TEXTURE_SHADOW2D_ARRAY:
1581                                 args.coords[1] =
1582                                         LLVMBuildAdd(ctx->ac.builder, args.coords[1],
1583                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY], "");
1584                                 /* fall through */
1585                         case TGSI_TEXTURE_1D:
1586                         case TGSI_TEXTURE_SHADOW1D:
1587                         case TGSI_TEXTURE_1D_ARRAY:
1588                         case TGSI_TEXTURE_SHADOW1D_ARRAY:
1589                                 args.coords[0] =
1590                                         LLVMBuildAdd(ctx->ac.builder, args.coords[0],
1591                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX], "");
1592                                 break;
1593                                 /* texture offsets do not apply to other texture targets */
1594                         }
1595                 }
1596         }
1597
1598         if (opcode == TGSI_OPCODE_TG4) {
1599                 unsigned gather_comp = 0;
1600
1601                 /* DMASK was repurposed for GATHER4. 4 components are always
1602                  * returned and DMASK works like a swizzle - it selects
1603                  * the component to fetch. The only valid DMASK values are
1604                  * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
1605                  * (red,red,red,red) etc.) The ISA document doesn't mention
1606                  * this.
1607                  */
1608
1609                 /* Get the component index from src1.x for Gather4. */
1610                 if (!tgsi_is_shadow_target(target)) {
1611                         LLVMValueRef comp_imm;
1612                         struct tgsi_src_register src1 = inst->Src[1].Register;
1613
1614                         assert(src1.File == TGSI_FILE_IMMEDIATE);
1615
1616                         comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
1617                         gather_comp = LLVMConstIntGetZExtValue(comp_imm);
1618                         gather_comp = CLAMP(gather_comp, 0, 3);
1619                 }
1620
1621                 args.dmask = 1 << gather_comp;
1622         } else {
1623                 args.dmask = 0xf;
1624         }
1625
1626         args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1627         args.unorm = target == TGSI_TEXTURE_RECT ||
1628                      target == TGSI_TEXTURE_SHADOWRECT;
1629         args.opcode = ac_image_sample;
1630
1631         switch (opcode) {
1632         case TGSI_OPCODE_TXF:
1633         case TGSI_OPCODE_TXF_LZ:
1634                 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
1635                               target == TGSI_TEXTURE_2D_MSAA ||
1636                               target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
1637                                       ac_image_load : ac_image_load_mip;
1638                 break;
1639         case TGSI_OPCODE_LODQ:
1640                 args.opcode = ac_image_get_lod;
1641                 break;
1642         case TGSI_OPCODE_TEX:
1643         case TGSI_OPCODE_TEX2:
1644         case TGSI_OPCODE_TXP:
1645                 if (ctx->type != PIPE_SHADER_FRAGMENT)
1646                         args.level_zero = true;
1647                 break;
1648         case TGSI_OPCODE_TEX_LZ:
1649                 args.level_zero = true;
1650                 break;
1651         case TGSI_OPCODE_TXB:
1652         case TGSI_OPCODE_TXB2:
1653                 assert(ctx->type == PIPE_SHADER_FRAGMENT);
1654                 break;
1655         case TGSI_OPCODE_TXL:
1656         case TGSI_OPCODE_TXL2:
1657                 break;
1658         case TGSI_OPCODE_TXD:
1659                 break;
1660         case TGSI_OPCODE_TG4:
1661                 args.opcode = ac_image_gather4;
1662                 args.level_zero = true;
1663                 break;
1664         default:
1665                 assert(0);
1666                 return;
1667         }
1668
1669         /* The hardware needs special lowering for Gather4 with integer formats. */
1670         LLVMValueRef gather4_int_result_workaround = NULL;
1671
1672         if (ctx->screen->info.chip_class <= GFX8 &&
1673             opcode == TGSI_OPCODE_TG4) {
1674                 assert(inst->Texture.ReturnType != TGSI_RETURN_TYPE_UNKNOWN);
1675
1676                 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
1677                     inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
1678                         gather4_int_result_workaround =
1679                                 si_lower_gather4_integer(ctx, &args, target,
1680                                                          inst->Texture.ReturnType);
1681                 }
1682         }
1683
1684         args.attributes = AC_FUNC_ATTR_READNONE;
1685         LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
1686
1687         if (gather4_int_result_workaround) {
1688                 result = si_fix_gather4_integer_result(ctx, result,
1689                                                        inst->Texture.ReturnType,
1690                                                        gather4_int_result_workaround);
1691         }
1692
1693         emit_data->output[emit_data->chan] = result;
1694 }
1695
1696 static void si_llvm_emit_txqs(
1697         const struct lp_build_tgsi_action *action,
1698         struct lp_build_tgsi_context *bld_base,
1699         struct lp_build_emit_data *emit_data)
1700 {
1701         struct si_shader_context *ctx = si_shader_context(bld_base);
1702         LLVMValueRef rsrc;
1703
1704         tex_fetch_ptrs(bld_base, emit_data, &rsrc, NULL, NULL);
1705
1706         rsrc = LLVMBuildBitCast(ctx->ac.builder, rsrc, ctx->v8i32, "");
1707         emit_data->output[emit_data->chan] =
1708                 ac_build_image_get_sample_count(&ctx->ac, rsrc);
1709 }
1710
1711 static LLVMValueRef si_llvm_emit_fbfetch(struct si_shader_context *ctx)
1712 {
1713         struct ac_image_args args = {};
1714         LLVMValueRef ptr, image, fmask;
1715
1716         /* Ignore src0, because KHR_blend_func_extended disallows multiple render
1717          * targets.
1718          */
1719
1720         /* Load the image descriptor. */
1721         STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
1722         ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1723         ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr,
1724                                    ac_array_in_const32_addr_space(ctx->v8i32), "");
1725         image = ac_build_load_to_sgpr(&ctx->ac, ptr,
1726                         LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
1727
1728         unsigned chan = 0;
1729
1730         args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 0, 16);
1731
1732         if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D)
1733                 args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 16, 16);
1734
1735         /* Get the current render target layer index. */
1736         if (ctx->shader->key.mono.u.ps.fbfetch_layered)
1737                 args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_ANCILLARY, 16, 11);
1738
1739         if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
1740                 args.coords[chan++] = si_get_sample_id(ctx);
1741
1742         if (ctx->shader->key.mono.u.ps.fbfetch_msaa &&
1743             !(ctx->screen->debug_flags & DBG(NO_FMASK))) {
1744                 fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
1745                         LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
1746
1747                 ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
1748                                          ctx->shader->key.mono.u.ps.fbfetch_layered);
1749         }
1750
1751         args.opcode = ac_image_load;
1752         args.resource = image;
1753         args.dmask = 0xf;
1754         args.attributes = AC_FUNC_ATTR_READNONE;
1755
1756         if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
1757                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1758                         ac_image_2darraymsaa : ac_image_2dmsaa;
1759         else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D)
1760                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1761                         ac_image_1darray : ac_image_1d;
1762         else
1763                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1764                         ac_image_2darray : ac_image_2d;
1765
1766         return ac_build_image_opcode(&ctx->ac, &args);
1767 }
1768
1769 static void si_tgsi_emit_fbfetch(const struct lp_build_tgsi_action *action,
1770                                  struct lp_build_tgsi_context *bld_base,
1771                                  struct lp_build_emit_data *emit_data)
1772 {
1773         struct si_shader_context *ctx = si_shader_context(bld_base);
1774
1775         emit_data->output[emit_data->chan] = si_llvm_emit_fbfetch(ctx);
1776 }
1777
1778 LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi)
1779 {
1780         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1781
1782         return si_llvm_emit_fbfetch(ctx);
1783 }
1784
1785 /**
1786  * Setup actions for TGSI memory opcode, including texture opcodes.
1787  */
1788 void si_shader_context_init_mem(struct si_shader_context *ctx)
1789 {
1790         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1791
1792         bld_base->op_actions[TGSI_OPCODE_TEX].emit = build_tex_intrinsic;
1793         bld_base->op_actions[TGSI_OPCODE_TEX_LZ].emit = build_tex_intrinsic;
1794         bld_base->op_actions[TGSI_OPCODE_TEX2].emit = build_tex_intrinsic;
1795         bld_base->op_actions[TGSI_OPCODE_TXB].emit = build_tex_intrinsic;
1796         bld_base->op_actions[TGSI_OPCODE_TXB2].emit = build_tex_intrinsic;
1797         bld_base->op_actions[TGSI_OPCODE_TXD].emit = build_tex_intrinsic;
1798         bld_base->op_actions[TGSI_OPCODE_TXF].emit = build_tex_intrinsic;
1799         bld_base->op_actions[TGSI_OPCODE_TXF_LZ].emit = build_tex_intrinsic;
1800         bld_base->op_actions[TGSI_OPCODE_TXL].emit = build_tex_intrinsic;
1801         bld_base->op_actions[TGSI_OPCODE_TXL2].emit = build_tex_intrinsic;
1802         bld_base->op_actions[TGSI_OPCODE_TXP].emit = build_tex_intrinsic;
1803         bld_base->op_actions[TGSI_OPCODE_TXQ].emit = resq_emit;
1804         bld_base->op_actions[TGSI_OPCODE_TG4].emit = build_tex_intrinsic;
1805         bld_base->op_actions[TGSI_OPCODE_LODQ].emit = build_tex_intrinsic;
1806         bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
1807
1808         bld_base->op_actions[TGSI_OPCODE_FBFETCH].emit = si_tgsi_emit_fbfetch;
1809
1810         bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
1811         bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
1812         bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
1813
1814         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].emit = atomic_emit;
1815         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
1816         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].emit = atomic_emit;
1817         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
1818         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].emit = atomic_emit;
1819         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
1820         bld_base->op_actions[TGSI_OPCODE_ATOMAND].emit = atomic_emit;
1821         bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
1822         bld_base->op_actions[TGSI_OPCODE_ATOMOR].emit = atomic_emit;
1823         bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
1824         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].emit = atomic_emit;
1825         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
1826         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].emit = atomic_emit;
1827         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
1828         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].emit = atomic_emit;
1829         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
1830         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].emit = atomic_emit;
1831         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
1832         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].emit = atomic_emit;
1833         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
1834         bld_base->op_actions[TGSI_OPCODE_ATOMINC_WRAP].emit = atomic_emit;
1835         bld_base->op_actions[TGSI_OPCODE_ATOMINC_WRAP].intr_name = "inc";
1836         bld_base->op_actions[TGSI_OPCODE_ATOMDEC_WRAP].emit = atomic_emit;
1837         bld_base->op_actions[TGSI_OPCODE_ATOMDEC_WRAP].intr_name = "dec";
1838 }