src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c

   1 /*
   2  * Copyright 2017 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include "si_shader_internal.h"
  26 #include "si_pipe.h"
  27 #include "sid.h"
  28 #include "tgsi/tgsi_build.h"
  29 #include "tgsi/tgsi_util.h"
  30 #include "ac_llvm_util.h"
  31
  32 static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
  33                            struct lp_build_emit_data *emit_data,
  34                            LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
  35                            LLVMValueRef *fmask_ptr);
  36
  37 /**
  38  * Given a v8i32 resource descriptor for a buffer, extract the size of the
  39  * buffer in number of elements and return it as an i32.
  40  */
  41 static LLVMValueRef get_buffer_size(
  42         struct lp_build_tgsi_context *bld_base,
  43         LLVMValueRef descriptor)
  44 {
  45         struct si_shader_context *ctx = si_shader_context(bld_base);
  46         LLVMBuilderRef builder = ctx->ac.builder;
  47         LLVMValueRef size =
  48                 LLVMBuildExtractElement(builder, descriptor,
  49                                         LLVMConstInt(ctx->i32, 2, 0), "");
  50
  51         if (ctx->screen->info.chip_class == VI) {
  52                 /* On VI, the descriptor contains the size in bytes,
  53                  * but TXQ must return the size in elements.
  54                  * The stride is always non-zero for resources using TXQ.
  55                  */
  56                 LLVMValueRef stride =
  57                         LLVMBuildExtractElement(builder, descriptor,
  58                                                 ctx->i32_1, "");
  59                 stride = LLVMBuildLShr(builder, stride,
  60                                        LLVMConstInt(ctx->i32, 16, 0), "");
  61                 stride = LLVMBuildAnd(builder, stride,
  62                                       LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
  63
  64                 size = LLVMBuildUDiv(builder, size, stride, "");
  65         }
  66
  67         return size;
  68 }
  69
  70 static LLVMValueRef
  71 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
  72                          const struct tgsi_full_src_register *reg,
  73                          bool ubo)
  74 {
  75         LLVMValueRef index;
  76
  77         if (!reg->Register.Indirect) {
  78                 index = LLVMConstInt(ctx->i32, reg->Register.Index, false);
  79         } else {
  80                 index = si_get_indirect_index(ctx, &reg->Indirect,
  81                                               1, reg->Register.Index);
  82         }
  83
  84         if (ubo)
  85                 return ctx->abi.load_ubo(&ctx->abi, index);
  86         else
  87                 return ctx->abi.load_ssbo(&ctx->abi, index, false);
  88 }
  89
  90 static enum ac_image_dim
  91 ac_texture_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target)
  92 {
  93         switch (target) {
  94         case TGSI_TEXTURE_1D:
  95         case TGSI_TEXTURE_SHADOW1D:
  96                 if (screen->info.chip_class >= GFX9)
  97                         return ac_image_2d;
  98                 return ac_image_1d;
  99         case TGSI_TEXTURE_2D:
 100         case TGSI_TEXTURE_SHADOW2D:
 101         case TGSI_TEXTURE_RECT:
 102         case TGSI_TEXTURE_SHADOWRECT:
 103                 return ac_image_2d;
 104         case TGSI_TEXTURE_3D:
 105                 return ac_image_3d;
 106         case TGSI_TEXTURE_CUBE:
 107         case TGSI_TEXTURE_SHADOWCUBE:
 108         case TGSI_TEXTURE_CUBE_ARRAY:
 109         case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
 110                 return ac_image_cube;
 111         case TGSI_TEXTURE_1D_ARRAY:
 112         case TGSI_TEXTURE_SHADOW1D_ARRAY:
 113                 if (screen->info.chip_class >= GFX9)
 114                         return ac_image_2darray;
 115                 return ac_image_1darray;
 116         case TGSI_TEXTURE_2D_ARRAY:
 117         case TGSI_TEXTURE_SHADOW2D_ARRAY:
 118                 return ac_image_2darray;
 119         case TGSI_TEXTURE_2D_MSAA:
 120                 return ac_image_2dmsaa;
 121         case TGSI_TEXTURE_2D_ARRAY_MSAA:
 122                 return ac_image_2darraymsaa;
 123         default:
 124                 unreachable("unhandled texture type");
 125         }
 126 }
 127
 128 static enum ac_image_dim
 129 ac_image_dim_from_tgsi_target(struct si_screen *screen, enum tgsi_texture_type target)
 130 {
 131         enum ac_image_dim dim = ac_texture_dim_from_tgsi_target(screen, target);
 132
 133         /* Match the resource type set in the descriptor. */
 134         if (dim == ac_image_cube ||
 135             (screen->info.chip_class <= VI && dim == ac_image_3d))
 136                 dim = ac_image_2darray;
 137         else if (target == TGSI_TEXTURE_2D && screen->info.chip_class >= GFX9) {
 138                 /* When a single layer of a 3D texture is bound, the shader
 139                  * will refer to a 2D target, but the descriptor has a 3D type.
 140                  * Since the HW ignores BASE_ARRAY in this case, we need to
 141                  * send 3 coordinates. This doesn't hurt when the underlying
 142                  * texture is non-3D.
 143                  */
 144                 dim = ac_image_3d;
 145         }
 146
 147         return dim;
 148 }
 149
 150 /**
 151  * Given a 256-bit resource descriptor, force the DCC enable bit to off.
 152  *
 153  * At least on Tonga, executing image stores on images with DCC enabled and
 154  * non-trivial can eventually lead to lockups. This can occur when an
 155  * application binds an image as read-only but then uses a shader that writes
 156  * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
 157  * program termination) in this case, but it doesn't cost much to be a bit
 158  * nicer: disabling DCC in the shader still leads to undefined results but
 159  * avoids the lockup.
 160  */
 161 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
 162                                   LLVMValueRef rsrc)
 163 {
 164         if (ctx->screen->info.chip_class <= CIK) {
 165                 return rsrc;
 166         } else {
 167                 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
 168                 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
 169                 LLVMValueRef tmp;
 170
 171                 tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
 172                 tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
 173                 return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
 174         }
 175 }
 176
 177 LLVMValueRef si_load_image_desc(struct si_shader_context *ctx,
 178                                 LLVMValueRef list, LLVMValueRef index,
 179                                 enum ac_descriptor_type desc_type, bool dcc_off)
 180 {
 181         LLVMBuilderRef builder = ctx->ac.builder;
 182         LLVMValueRef rsrc;
 183
 184         if (desc_type == AC_DESC_BUFFER) {
 185                 index = LLVMBuildMul(builder, index,
 186                                      LLVMConstInt(ctx->i32, 2, 0), "");
 187                 index = LLVMBuildAdd(builder, index,
 188                                      ctx->i32_1, "");
 189                 list = LLVMBuildPointerCast(builder, list,
 190                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
 191         } else {
 192                 assert(desc_type == AC_DESC_IMAGE);
 193         }
 194
 195         rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
 196         if (desc_type == AC_DESC_IMAGE && dcc_off)
 197                 rsrc = force_dcc_off(ctx, rsrc);
 198         return rsrc;
 199 }
 200
 201 /**
 202  * Load the resource descriptor for \p image.
 203  */
 204 static void
 205 image_fetch_rsrc(
 206         struct lp_build_tgsi_context *bld_base,
 207         const struct tgsi_full_src_register *image,
 208         bool is_store, unsigned target,
 209         LLVMValueRef *rsrc)
 210 {
 211         struct si_shader_context *ctx = si_shader_context(bld_base);
 212         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
 213                                              ctx->param_samplers_and_images);
 214         LLVMValueRef index;
 215         bool dcc_off = is_store;
 216
 217         if (!image->Register.Indirect) {
 218                 const struct tgsi_shader_info *info = bld_base->info;
 219                 unsigned images_writemask = info->images_store |
 220                                             info->images_atomic;
 221
 222                 index = LLVMConstInt(ctx->i32,
 223                                      si_get_image_slot(image->Register.Index), 0);
 224
 225                 if (images_writemask & (1 << image->Register.Index))
 226                         dcc_off = true;
 227         } else {
 228                 /* From the GL_ARB_shader_image_load_store extension spec:
 229                  *
 230                  *    If a shader performs an image load, store, or atomic
 231                  *    operation using an image variable declared as an array,
 232                  *    and if the index used to select an individual element is
 233                  *    negative or greater than or equal to the size of the
 234                  *    array, the results of the operation are undefined but may
 235                  *    not lead to termination.
 236                  */
 237                 index = si_get_bounded_indirect_index(ctx, &image->Indirect,
 238                                                       image->Register.Index,
 239                                                       ctx->num_images);
 240                 index = LLVMBuildSub(ctx->ac.builder,
 241                                      LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0),
 242                                      index, "");
 243         }
 244
 245         if (image->Register.File != TGSI_FILE_IMAGE) {
 246                 /* Bindless descriptors are accessible from a different pair of
 247                  * user SGPR indices.
 248                  */
 249                 rsrc_ptr = LLVMGetParam(ctx->main_fn,
 250                                         ctx->param_bindless_samplers_and_images);
 251                 index = lp_build_emit_fetch_src(bld_base, image,
 252                                                 TGSI_TYPE_UNSIGNED, 0);
 253
 254                 /* For simplicity, bindless image descriptors use fixed
 255                  * 16-dword slots for now.
 256                  */
 257                 index = LLVMBuildMul(ctx->ac.builder, index,
 258                                      LLVMConstInt(ctx->i32, 2, 0), "");
 259         }
 260
 261         *rsrc = si_load_image_desc(ctx, rsrc_ptr, index,
 262                                    target == TGSI_TEXTURE_BUFFER ? AC_DESC_BUFFER : AC_DESC_IMAGE,
 263                                    dcc_off);
 264 }
 265
 266 static void image_fetch_coords(
 267                 struct lp_build_tgsi_context *bld_base,
 268                 const struct tgsi_full_instruction *inst,
 269                 unsigned src, LLVMValueRef desc,
 270                 LLVMValueRef *coords)
 271 {
 272         struct si_shader_context *ctx = si_shader_context(bld_base);
 273         LLVMBuilderRef builder = ctx->ac.builder;
 274         unsigned target = inst->Memory.Texture;
 275         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
 276         LLVMValueRef tmp;
 277         int chan;
 278
 279         if (target == TGSI_TEXTURE_2D_MSAA ||
 280             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
 281                 /* Need the sample index as well. */
 282                 num_coords++;
 283         }
 284
 285         for (chan = 0; chan < num_coords; ++chan) {
 286                 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
 287                 tmp = ac_to_integer(&ctx->ac, tmp);
 288                 coords[chan] = tmp;
 289         }
 290
 291         if (ctx->screen->info.chip_class >= GFX9) {
 292                 /* 1D textures are allocated and used as 2D on GFX9. */
 293                 if (target == TGSI_TEXTURE_1D) {
 294                         coords[1] = ctx->i32_0;
 295                 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
 296                         coords[2] = coords[1];
 297                         coords[1] = ctx->i32_0;
 298                 } else if (target == TGSI_TEXTURE_2D) {
 299                         /* The hw can't bind a slice of a 3D image as a 2D
 300                          * image, because it ignores BASE_ARRAY if the target
 301                          * is 3D. The workaround is to read BASE_ARRAY and set
 302                          * it as the 3rd address operand for all 2D images.
 303                          */
 304                         LLVMValueRef first_layer, const5, mask;
 305
 306                         const5 = LLVMConstInt(ctx->i32, 5, 0);
 307                         mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
 308                         first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
 309                         first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
 310
 311                         coords[2] = first_layer;
 312                 }
 313         }
 314 }
 315
 316 /**
 317  * Append the resource and indexing arguments for buffer intrinsics.
 318  *
 319  * \param rsrc the v4i32 buffer resource
 320  * \param index index into the buffer (stride-based)
 321  * \param offset byte offset into the buffer
 322  */
 323 static void buffer_append_args(
 324                 struct si_shader_context *ctx,
 325                 struct lp_build_emit_data *emit_data,
 326                 LLVMValueRef rsrc,
 327                 LLVMValueRef index,
 328                 LLVMValueRef offset,
 329                 bool atomic,
 330                 bool force_glc)
 331 {
 332         const struct tgsi_full_instruction *inst = emit_data->inst;
 333         LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
 334         LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
 335
 336         emit_data->args[emit_data->arg_count++] = rsrc;
 337         emit_data->args[emit_data->arg_count++] = index; /* vindex */
 338         emit_data->args[emit_data->arg_count++] = offset; /* voffset */
 339         if (!atomic) {
 340                 emit_data->args[emit_data->arg_count++] =
 341                         force_glc ||
 342                         inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
 343                         i1true : i1false; /* glc */
 344         }
 345         emit_data->args[emit_data->arg_count++] = i1false; /* slc */
 346 }
 347
 348 static void load_emit_buffer(struct si_shader_context *ctx,
 349                              struct lp_build_emit_data *emit_data,
 350                              bool can_speculate, bool allow_smem)
 351 {
 352         const struct tgsi_full_instruction *inst = emit_data->inst;
 353         uint writemask = inst->Dst[0].Register.WriteMask;
 354         uint count = util_last_bit(writemask);
 355         LLVMValueRef *args = emit_data->args;
 356
 357         /* Don't use SMEM for shader buffer loads, because LLVM doesn't
 358          * select SMEM for SI.load.const with a non-constant offset, and
 359          * constant offsets practically don't exist with shader buffers.
 360          *
 361          * Also, SI.load.const doesn't use inst_offset when it's lowered
 362          * to VMEM, so we just end up with more VALU instructions in the end
 363          * and no benefit.
 364          *
 365          * TODO: Remove this line once LLVM can select SMEM with a non-constant
 366          *       offset, and can derive inst_offset when VMEM is selected.
 367          *       After that, si_memory_barrier should invalidate sL1 for shader
 368          *       buffers.
 369          */
 370
 371         assert(LLVMConstIntGetZExtValue(args[1]) == 0); /* vindex */
 372         emit_data->output[emit_data->chan] =
 373                 ac_build_buffer_load(&ctx->ac, args[0], count, NULL,
 374                                      args[2], NULL, 0,
 375                                      LLVMConstIntGetZExtValue(args[3]),
 376                                      LLVMConstIntGetZExtValue(args[4]),
 377                                      can_speculate, allow_smem);
 378 }
 379
 380 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
 381                                    const struct tgsi_full_instruction *inst,
 382                                    LLVMTypeRef type, int arg)
 383 {
 384         LLVMBuilderRef builder = ctx->ac.builder;
 385         LLVMValueRef offset, ptr;
 386         int addr_space;
 387
 388         offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
 389         offset = ac_to_integer(&ctx->ac, offset);
 390
 391         ptr = ctx->ac.lds;
 392         ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
 393         addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
 394         ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
 395
 396         return ptr;
 397 }
 398
 399 static void load_emit_memory(
 400                 struct si_shader_context *ctx,
 401                 struct lp_build_emit_data *emit_data)
 402 {
 403         const struct tgsi_full_instruction *inst = emit_data->inst;
 404         unsigned writemask = inst->Dst[0].Register.WriteMask;
 405         LLVMValueRef channels[4], ptr, derived_ptr, index;
 406         int chan;
 407
 408         ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
 409
 410         for (chan = 0; chan < 4; ++chan) {
 411                 if (!(writemask & (1 << chan))) {
 412                         channels[chan] = LLVMGetUndef(ctx->f32);
 413                         continue;
 414                 }
 415
 416                 index = LLVMConstInt(ctx->i32, chan, 0);
 417                 derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, "");
 418                 channels[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, "");
 419         }
 420         emit_data->output[emit_data->chan] = ac_build_gather_values(&ctx->ac, channels, 4);
 421 }
 422
 423 /**
 424  * Return true if the memory accessed by a LOAD or STORE instruction is
 425  * read-only or write-only, respectively.
 426  *
 427  * \param shader_buffers_reverse_access_mask
 428  *      For LOAD, set this to (store | atomic) slot usage in the shader.
 429  *      For STORE, set this to (load | atomic) slot usage in the shader.
 430  * \param images_reverse_access_mask  Same as above, but for images.
 431  */
 432 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
 433                                   const struct tgsi_shader_info *info,
 434                                   unsigned shader_buffers_reverse_access_mask,
 435                                   unsigned images_reverse_access_mask)
 436 {
 437         /* RESTRICT means NOALIAS.
 438          * If there are no writes, we can assume the accessed memory is read-only.
 439          * If there are no reads, we can assume the accessed memory is write-only.
 440          */
 441         if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
 442                 unsigned reverse_access_mask;
 443
 444                 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 445                         reverse_access_mask = shader_buffers_reverse_access_mask;
 446                 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 447                         reverse_access_mask = info->images_buffers &
 448                                               images_reverse_access_mask;
 449                 } else {
 450                         reverse_access_mask = ~info->images_buffers &
 451                                               images_reverse_access_mask;
 452                 }
 453
 454                 if (inst->Src[0].Register.Indirect) {
 455                         if (!reverse_access_mask)
 456                                 return true;
 457                 } else {
 458                         if (!(reverse_access_mask &
 459                               (1u << inst->Src[0].Register.Index)))
 460                                 return true;
 461                 }
 462         }
 463
 464         /* If there are no buffer writes (for both shader buffers & image
 465          * buffers), it implies that buffer memory is read-only.
 466          * If there are no buffer reads (for both shader buffers & image
 467          * buffers), it implies that buffer memory is write-only.
 468          *
 469          * Same for the case when there are no writes/reads for non-buffer
 470          * images.
 471          */
 472         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
 473             (inst->Memory.Texture == TGSI_TEXTURE_BUFFER &&
 474              (inst->Src[0].Register.File == TGSI_FILE_IMAGE ||
 475               tgsi_is_bindless_image_file(inst->Src[0].Register.File)))) {
 476                 if (!shader_buffers_reverse_access_mask &&
 477                     !(info->images_buffers & images_reverse_access_mask))
 478                         return true;
 479         } else {
 480                 if (!(~info->images_buffers & images_reverse_access_mask))
 481                         return true;
 482         }
 483         return false;
 484 }
 485
 486 static void load_emit(
 487                 const struct lp_build_tgsi_action *action,
 488                 struct lp_build_tgsi_context *bld_base,
 489                 struct lp_build_emit_data *emit_data)
 490 {
 491         struct si_shader_context *ctx = si_shader_context(bld_base);
 492         const struct tgsi_full_instruction * inst = emit_data->inst;
 493         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 494         bool can_speculate = false;
 495
 496         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
 497                 load_emit_memory(ctx, emit_data);
 498                 return;
 499         }
 500
 501         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
 502             inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) {
 503                 LLVMValueRef offset, tmp, rsrc;
 504
 505                 bool ubo = inst->Src[0].Register.File == TGSI_FILE_CONSTBUF;
 506                 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], ubo);
 507
 508                 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
 509                 offset = ac_to_integer(&ctx->ac, tmp);
 510
 511                 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
 512                                    offset, false, false);
 513         } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE ||
 514                    tgsi_is_bindless_image_file(inst->Src[0].Register.File)) {
 515                 LLVMValueRef rsrc;
 516                 unsigned target = inst->Memory.Texture;
 517
 518                 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
 519                 image_fetch_coords(bld_base, inst, 1, rsrc, &emit_data->args[1]);
 520
 521                 if (target == TGSI_TEXTURE_BUFFER) {
 522                         buffer_append_args(ctx, emit_data, rsrc, emit_data->args[1],
 523                                            ctx->i32_0, false, false);
 524                 } else {
 525                         emit_data->args[0] = rsrc;
 526                 }
 527         }
 528
 529         if (inst->Src[0].Register.File == TGSI_FILE_CONSTBUF) {
 530                 load_emit_buffer(ctx, emit_data, true, true);
 531                 return;
 532         }
 533
 534         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
 535                 ac_build_waitcnt(&ctx->ac, VM_CNT);
 536
 537         can_speculate = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
 538                           is_oneway_access_only(inst, info,
 539                                                 info->shader_buffers_store |
 540                                                 info->shader_buffers_atomic,
 541                                                 info->images_store |
 542                                                 info->images_atomic);
 543
 544         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 545                 load_emit_buffer(ctx, emit_data, can_speculate, false);
 546                 return;
 547         }
 548
 549         if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 550                 unsigned num_channels = util_last_bit(inst->Dst[0].Register.WriteMask);
 551                 LLVMValueRef result =
 552                         ac_build_buffer_load_format(&ctx->ac,
 553                                                     emit_data->args[0],
 554                                                     emit_data->args[1],
 555                                                     emit_data->args[2],
 556                                                     num_channels,
 557                                                     LLVMConstIntGetZExtValue(emit_data->args[3]),
 558                                                     can_speculate);
 559                 emit_data->output[emit_data->chan] =
 560                         ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
 561         } else {
 562                 struct ac_image_args args = {};
 563                 args.opcode = ac_image_load;
 564                 args.resource = emit_data->args[0];
 565                 memcpy(args.coords, &emit_data->args[1], sizeof(args.coords));
 566                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 567                 if (inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE))
 568                         args.cache_policy = ac_glc;
 569                 args.attributes = ac_get_load_intr_attribs(can_speculate);
 570                 args.dmask = 0xf;
 571
 572                 emit_data->output[emit_data->chan] =
 573                         ac_build_image_opcode(&ctx->ac, &args);
 574         }
 575 }
 576
 577 static void store_emit_buffer(
 578                 struct si_shader_context *ctx,
 579                 struct lp_build_emit_data *emit_data,
 580                 bool writeonly_memory)
 581 {
 582         const struct tgsi_full_instruction *inst = emit_data->inst;
 583         LLVMBuilderRef builder = ctx->ac.builder;
 584         LLVMValueRef base_data = emit_data->args[0];
 585         LLVMValueRef base_offset = emit_data->args[3];
 586         unsigned writemask = inst->Dst[0].Register.WriteMask;
 587
 588         /* If this is write-only, don't keep data in L1 to prevent
 589          * evicting L1 cache lines that may be needed by other
 590          * instructions.
 591          */
 592         if (writeonly_memory)
 593                 emit_data->args[4] = LLVMConstInt(ctx->i1, 1, 0); /* GLC = 1 */
 594
 595         while (writemask) {
 596                 int start, count;
 597                 const char *intrinsic_name;
 598                 LLVMValueRef data;
 599                 LLVMValueRef offset;
 600                 LLVMValueRef tmp;
 601
 602                 u_bit_scan_consecutive_range(&writemask, &start, &count);
 603
 604                 /* Due to an LLVM limitation, split 3-element writes
 605                  * into a 2-element and a 1-element write. */
 606                 if (count == 3) {
 607                         writemask |= 1 << (start + 2);
 608                         count = 2;
 609                 }
 610
 611                 if (count == 4) {
 612                         data = base_data;
 613                         intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
 614                 } else if (count == 2) {
 615                         LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
 616
 617                         tmp = LLVMBuildExtractElement(
 618                                 builder, base_data,
 619                                 LLVMConstInt(ctx->i32, start, 0), "");
 620                         data = LLVMBuildInsertElement(
 621                                 builder, LLVMGetUndef(v2f32), tmp,
 622                                 ctx->i32_0, "");
 623
 624                         tmp = LLVMBuildExtractElement(
 625                                 builder, base_data,
 626                                 LLVMConstInt(ctx->i32, start + 1, 0), "");
 627                         data = LLVMBuildInsertElement(
 628                                 builder, data, tmp, ctx->i32_1, "");
 629
 630                         intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
 631                 } else {
 632                         assert(count == 1);
 633                         data = LLVMBuildExtractElement(
 634                                 builder, base_data,
 635                                 LLVMConstInt(ctx->i32, start, 0), "");
 636                         intrinsic_name = "llvm.amdgcn.buffer.store.f32";
 637                 }
 638
 639                 offset = base_offset;
 640                 if (start != 0) {
 641                         offset = LLVMBuildAdd(
 642                                 builder, offset,
 643                                 LLVMConstInt(ctx->i32, start * 4, 0), "");
 644                 }
 645
 646                 emit_data->args[0] = data;
 647                 emit_data->args[3] = offset;
 648
 649                 ac_build_intrinsic(
 650                         &ctx->ac, intrinsic_name, ctx->voidt,
 651                         emit_data->args, emit_data->arg_count,
 652                         ac_get_store_intr_attribs(writeonly_memory));
 653         }
 654 }
 655
 656 static void store_emit_memory(
 657                 struct si_shader_context *ctx,
 658                 struct lp_build_emit_data *emit_data)
 659 {
 660         const struct tgsi_full_instruction *inst = emit_data->inst;
 661         LLVMBuilderRef builder = ctx->ac.builder;
 662         unsigned writemask = inst->Dst[0].Register.WriteMask;
 663         LLVMValueRef ptr, derived_ptr, data, index;
 664         int chan;
 665
 666         ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
 667
 668         for (chan = 0; chan < 4; ++chan) {
 669                 if (!(writemask & (1 << chan))) {
 670                         continue;
 671                 }
 672                 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
 673                 index = LLVMConstInt(ctx->i32, chan, 0);
 674                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
 675                 LLVMBuildStore(builder, data, derived_ptr);
 676         }
 677 }
 678
 679 static void store_emit(
 680                 const struct lp_build_tgsi_action *action,
 681                 struct lp_build_tgsi_context *bld_base,
 682                 struct lp_build_emit_data *emit_data)
 683 {
 684         struct si_shader_context *ctx = si_shader_context(bld_base);
 685         const struct tgsi_full_instruction * inst = emit_data->inst;
 686         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 687         struct tgsi_full_src_register resource_reg =
 688                 tgsi_full_src_register_from_dst(&inst->Dst[0]);
 689         unsigned target = inst->Memory.Texture;
 690         bool writeonly_memory = false;
 691         LLVMValueRef chans[4], rsrc;
 692
 693         if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
 694                 store_emit_memory(ctx, emit_data);
 695                 return;
 696         }
 697
 698         for (unsigned chan = 0; chan < 4; ++chan)
 699                 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
 700
 701         emit_data->args[emit_data->arg_count++] =
 702                 ac_build_gather_values(&ctx->ac, chans, 4);
 703
 704         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
 705                 LLVMValueRef offset, tmp;
 706
 707                 rsrc = shader_buffer_fetch_rsrc(ctx, &resource_reg, false);
 708
 709                 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
 710                 offset = ac_to_integer(&ctx->ac, tmp);
 711
 712                 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
 713                                    offset, false, false);
 714         } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE ||
 715                    tgsi_is_bindless_image_file(inst->Dst[0].Register.File)) {
 716                 /* 8bit/16bit TC L1 write corruption bug on SI.
 717                  * All store opcodes not aligned to a dword are affected.
 718                  *
 719                  * The only way to get unaligned stores in radeonsi is through
 720                  * shader images.
 721                  */
 722                 bool force_glc = ctx->screen->info.chip_class == SI;
 723
 724                 image_fetch_rsrc(bld_base, &resource_reg, true, target, &rsrc);
 725                 image_fetch_coords(bld_base, inst, 0, rsrc, &emit_data->args[2]);
 726
 727                 if (target == TGSI_TEXTURE_BUFFER) {
 728                         buffer_append_args(ctx, emit_data, rsrc, emit_data->args[2],
 729                                            ctx->i32_0, false, force_glc);
 730                 } else {
 731                         emit_data->args[1] = rsrc;
 732                 }
 733         }
 734
 735         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
 736                 ac_build_waitcnt(&ctx->ac, VM_CNT);
 737
 738         writeonly_memory = is_oneway_access_only(inst, info,
 739                                                  info->shader_buffers_load |
 740                                                  info->shader_buffers_atomic,
 741                                                  info->images_load |
 742                                                  info->images_atomic);
 743
 744         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
 745                 store_emit_buffer(ctx, emit_data, writeonly_memory);
 746                 return;
 747         }
 748
 749         if (target == TGSI_TEXTURE_BUFFER) {
 750                 /* If this is write-only, don't keep data in L1 to prevent
 751                  * evicting L1 cache lines that may be needed by other
 752                  * instructions.
 753                  */
 754                 if (writeonly_memory)
 755                         emit_data->args[4] = LLVMConstInt(ctx->i1, 1, 0); /* GLC = 1 */
 756
 757                 emit_data->output[emit_data->chan] = ac_build_intrinsic(
 758                         &ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32",
 759                         ctx->voidt, emit_data->args,
 760                         emit_data->arg_count,
 761                         ac_get_store_intr_attribs(writeonly_memory));
 762         } else {
 763                 struct ac_image_args args = {};
 764                 args.opcode = ac_image_store;
 765                 args.data[0] = emit_data->args[0];
 766                 args.resource = emit_data->args[1];
 767                 memcpy(args.coords, &emit_data->args[2], sizeof(args.coords));
 768                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 769                 args.attributes = ac_get_store_intr_attribs(writeonly_memory);
 770                 args.dmask = 0xf;
 771
 772                 /* Workaround for 8bit/16bit TC L1 write corruption bug on SI.
 773                  * All store opcodes not aligned to a dword are affected.
 774                  */
 775                 if (ctx->screen->info.chip_class == SI ||
 776                     /* If this is write-only, don't keep data in L1 to prevent
 777                      * evicting L1 cache lines that may be needed by other
 778                      * instructions. */
 779                     writeonly_memory ||
 780                     inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE))
 781                         args.cache_policy = ac_glc;
 782
 783                 emit_data->output[emit_data->chan] =
 784                         ac_build_image_opcode(&ctx->ac, &args);
 785         }
 786 }
 787
 788 static void atomic_emit_memory(struct si_shader_context *ctx,
 789                                struct lp_build_emit_data *emit_data) {
 790         LLVMBuilderRef builder = ctx->ac.builder;
 791         const struct tgsi_full_instruction * inst = emit_data->inst;
 792         LLVMValueRef ptr, result, arg;
 793
 794         ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
 795
 796         arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
 797         arg = ac_to_integer(&ctx->ac, arg);
 798
 799         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 800                 LLVMValueRef new_data;
 801                 new_data = lp_build_emit_fetch(&ctx->bld_base,
 802                                                inst, 3, 0);
 803
 804                 new_data = ac_to_integer(&ctx->ac, new_data);
 805
 806                 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
 807                                        LLVMAtomicOrderingSequentiallyConsistent,
 808                                        LLVMAtomicOrderingSequentiallyConsistent,
 809                                        false);
 810
 811                 result = LLVMBuildExtractValue(builder, result, 0, "");
 812         } else {
 813                 LLVMAtomicRMWBinOp op;
 814
 815                 switch(inst->Instruction.Opcode) {
 816                         case TGSI_OPCODE_ATOMUADD:
 817                                 op = LLVMAtomicRMWBinOpAdd;
 818                                 break;
 819                         case TGSI_OPCODE_ATOMXCHG:
 820                                 op = LLVMAtomicRMWBinOpXchg;
 821                                 break;
 822                         case TGSI_OPCODE_ATOMAND:
 823                                 op = LLVMAtomicRMWBinOpAnd;
 824                                 break;
 825                         case TGSI_OPCODE_ATOMOR:
 826                                 op = LLVMAtomicRMWBinOpOr;
 827                                 break;
 828                         case TGSI_OPCODE_ATOMXOR:
 829                                 op = LLVMAtomicRMWBinOpXor;
 830                                 break;
 831                         case TGSI_OPCODE_ATOMUMIN:
 832                                 op = LLVMAtomicRMWBinOpUMin;
 833                                 break;
 834                         case TGSI_OPCODE_ATOMUMAX:
 835                                 op = LLVMAtomicRMWBinOpUMax;
 836                                 break;
 837                         case TGSI_OPCODE_ATOMIMIN:
 838                                 op = LLVMAtomicRMWBinOpMin;
 839                                 break;
 840                         case TGSI_OPCODE_ATOMIMAX:
 841                                 op = LLVMAtomicRMWBinOpMax;
 842                                 break;
 843                         default:
 844                                 unreachable("unknown atomic opcode");
 845                 }
 846
 847                 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
 848                                        LLVMAtomicOrderingSequentiallyConsistent,
 849                                        false);
 850         }
 851         emit_data->output[emit_data->chan] =
 852                 LLVMBuildBitCast(builder, result, ctx->f32, "");
 853 }
 854
 855 static void atomic_emit(
 856                 const struct lp_build_tgsi_action *action,
 857                 struct lp_build_tgsi_context *bld_base,
 858                 struct lp_build_emit_data *emit_data)
 859 {
 860         struct si_shader_context *ctx = si_shader_context(bld_base);
 861         const struct tgsi_full_instruction * inst = emit_data->inst;
 862
 863         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
 864                 atomic_emit_memory(ctx, emit_data);
 865                 return;
 866         }
 867
 868         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 869                 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
 870                  * of arguments, which is reversed relative to TGSI (and GLSL)
 871                  */
 872                 emit_data->args[emit_data->arg_count++] =
 873                         ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 3, 0));
 874         }
 875
 876         emit_data->args[emit_data->arg_count++] =
 877                 ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 2, 0));
 878
 879         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 880                 LLVMValueRef rsrc, offset;
 881
 882                 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], false);
 883                 offset = ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 1, 0));
 884
 885                 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
 886                                    offset, true, false);
 887         } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE ||
 888                    tgsi_is_bindless_image_file(inst->Src[0].Register.File)) {
 889                 unsigned target = inst->Memory.Texture;
 890                 LLVMValueRef rsrc;
 891
 892                 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
 893                 image_fetch_coords(bld_base, inst, 1, rsrc,
 894                                    &emit_data->args[emit_data->arg_count + 1]);
 895
 896                 if (target == TGSI_TEXTURE_BUFFER) {
 897                         buffer_append_args(ctx, emit_data, rsrc,
 898                                            emit_data->args[emit_data->arg_count + 1],
 899                                            ctx->i32_0, true, false);
 900                 } else {
 901                         emit_data->args[emit_data->arg_count] = rsrc;
 902                 }
 903         }
 904
 905         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
 906             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 907                 char intrinsic_name[40];
 908                 snprintf(intrinsic_name, sizeof(intrinsic_name),
 909                          "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
 910                 LLVMValueRef tmp = ac_build_intrinsic(
 911                         &ctx->ac, intrinsic_name, ctx->i32,
 912                         emit_data->args, emit_data->arg_count, 0);
 913                 emit_data->output[emit_data->chan] = ac_to_float(&ctx->ac, tmp);
 914         } else {
 915                 unsigned num_data = inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS ? 2 : 1;
 916                 struct ac_image_args args = {};
 917
 918                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
 919                         args.opcode = ac_image_atomic_cmpswap;
 920                 } else {
 921                         args.opcode = ac_image_atomic;
 922                         switch (inst->Instruction.Opcode) {
 923                         case TGSI_OPCODE_ATOMXCHG: args.atomic = ac_atomic_swap; break;
 924                         case TGSI_OPCODE_ATOMUADD: args.atomic = ac_atomic_add; break;
 925                         case TGSI_OPCODE_ATOMAND: args.atomic = ac_atomic_and; break;
 926                         case TGSI_OPCODE_ATOMOR: args.atomic = ac_atomic_or; break;
 927                         case TGSI_OPCODE_ATOMXOR: args.atomic = ac_atomic_xor; break;
 928                         case TGSI_OPCODE_ATOMUMIN: args.atomic = ac_atomic_umin; break;
 929                         case TGSI_OPCODE_ATOMUMAX: args.atomic = ac_atomic_umax; break;
 930                         case TGSI_OPCODE_ATOMIMIN: args.atomic = ac_atomic_smin; break;
 931                         case TGSI_OPCODE_ATOMIMAX: args.atomic = ac_atomic_smax; break;
 932                         default: unreachable("unhandled image atomic");
 933                         }
 934                 }
 935
 936                 for (unsigned i = 0; i < num_data; ++i)
 937                         args.data[i] = emit_data->args[i];
 938
 939                 args.resource = emit_data->args[num_data];
 940                 memcpy(args.coords, &emit_data->args[num_data + 1], sizeof(args.coords));
 941                 args.dim = ac_image_dim_from_tgsi_target(ctx->screen, inst->Memory.Texture);
 942
 943                 emit_data->output[emit_data->chan] =
 944                         ac_to_float(&ctx->ac, ac_build_image_opcode(&ctx->ac, &args));
 945         }
 946 }
 947
 948 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
 949                                 unsigned target, LLVMValueRef out)
 950 {
 951         LLVMBuilderRef builder = ctx->ac.builder;
 952
 953         /* 1D textures are allocated and used as 2D on GFX9. */
 954         if (ctx->screen->info.chip_class >= GFX9 &&
 955             (target == TGSI_TEXTURE_1D_ARRAY ||
 956              target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
 957                 LLVMValueRef layers =
 958                         LLVMBuildExtractElement(builder, out,
 959                                                 LLVMConstInt(ctx->i32, 2, 0), "");
 960                 out = LLVMBuildInsertElement(builder, out, layers,
 961                                              ctx->i32_1, "");
 962         }
 963
 964         /* Divide the number of layers by 6 to get the number of cubes. */
 965         if (target == TGSI_TEXTURE_CUBE_ARRAY ||
 966             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
 967                 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
 968
 969                 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
 970                 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
 971
 972                 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
 973         }
 974         return out;
 975 }
 976
 977 static void resq_emit(
 978                 const struct lp_build_tgsi_action *action,
 979                 struct lp_build_tgsi_context *bld_base,
 980                 struct lp_build_emit_data *emit_data)
 981 {
 982         struct si_shader_context *ctx = si_shader_context(bld_base);
 983         LLVMBuilderRef builder = ctx->ac.builder;
 984         const struct tgsi_full_instruction *inst = emit_data->inst;
 985         const struct tgsi_full_src_register *reg =
 986                 &inst->Src[inst->Instruction.Opcode == TGSI_OPCODE_TXQ ? 1 : 0];
 987
 988         if (reg->Register.File == TGSI_FILE_BUFFER) {
 989                 LLVMValueRef rsrc = shader_buffer_fetch_rsrc(ctx, reg, false);
 990
 991                 emit_data->output[emit_data->chan] =
 992                         LLVMBuildExtractElement(builder, rsrc,
 993                                                 LLVMConstInt(ctx->i32, 2, 0), "");
 994                 return;
 995         }
 996
 997         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
 998             inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
 999                 LLVMValueRef rsrc;
1000
1001                 tex_fetch_ptrs(bld_base, emit_data, &rsrc, NULL, NULL);
1002                 /* Read the size from the buffer descriptor directly. */
1003                 emit_data->output[emit_data->chan] =
1004                         get_buffer_size(bld_base, rsrc);
1005                 return;
1006         }
1007
1008         if (inst->Instruction.Opcode == TGSI_OPCODE_RESQ &&
1009             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
1010                 LLVMValueRef rsrc;
1011
1012                 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture, &rsrc);
1013                 emit_data->output[emit_data->chan] =
1014                         get_buffer_size(bld_base, rsrc);
1015                 return;
1016         }
1017
1018         unsigned target;
1019
1020         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
1021                 target = inst->Texture.Texture;
1022         } else {
1023                 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
1024                         target = TGSI_TEXTURE_2D_ARRAY;
1025                 else
1026                         target = inst->Memory.Texture;
1027         }
1028
1029         struct ac_image_args args = {};
1030         args.opcode = ac_image_get_resinfo;
1031         args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1032         args.dmask = 0xf;
1033
1034         if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
1035                 tex_fetch_ptrs(bld_base, emit_data, &args.resource, NULL, NULL);
1036                 args.lod = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
1037         } else {
1038                 image_fetch_rsrc(bld_base, reg, false, target, &args.resource);
1039                 args.lod = ctx->i32_0;
1040         }
1041
1042         emit_data->output[emit_data->chan] =
1043                 fix_resinfo(ctx, target, ac_build_image_opcode(&ctx->ac, &args));
1044 }
1045
1046 /**
1047  * Load an image view, fmask view. or sampler state descriptor.
1048  */
1049 LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx,
1050                                   LLVMValueRef list, LLVMValueRef index,
1051                                   enum ac_descriptor_type type)
1052 {
1053         LLVMBuilderRef builder = ctx->ac.builder;
1054
1055         switch (type) {
1056         case AC_DESC_IMAGE:
1057                 /* The image is at [0:7]. */
1058                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
1059                 break;
1060         case AC_DESC_BUFFER:
1061                 /* The buffer is in [4:7]. */
1062                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
1063                 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
1064                 list = LLVMBuildPointerCast(builder, list,
1065                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
1066                 break;
1067         case AC_DESC_FMASK:
1068                 /* The FMASK is at [8:15]. */
1069                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
1070                 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
1071                 break;
1072         case AC_DESC_SAMPLER:
1073                 /* The sampler state is at [12:15]. */
1074                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
1075                 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
1076                 list = LLVMBuildPointerCast(builder, list,
1077                                             ac_array_in_const32_addr_space(ctx->v4i32), "");
1078                 break;
1079         }
1080
1081         return ac_build_load_to_sgpr(&ctx->ac, list, index);
1082 }
1083
1084 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
1085  *
1086  * SI-CI:
1087  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
1088  *   filtering manually. The driver sets img7 to a mask clearing
1089  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
1090  *     s_and_b32 samp0, samp0, img7
1091  *
1092  * VI:
1093  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
1094  */
1095 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
1096                                            LLVMValueRef res, LLVMValueRef samp)
1097 {
1098         LLVMValueRef img7, samp0;
1099
1100         if (ctx->screen->info.chip_class >= VI)
1101                 return samp;
1102
1103         img7 = LLVMBuildExtractElement(ctx->ac.builder, res,
1104                                        LLVMConstInt(ctx->i32, 7, 0), "");
1105         samp0 = LLVMBuildExtractElement(ctx->ac.builder, samp,
1106                                         ctx->i32_0, "");
1107         samp0 = LLVMBuildAnd(ctx->ac.builder, samp0, img7, "");
1108         return LLVMBuildInsertElement(ctx->ac.builder, samp, samp0,
1109                                       ctx->i32_0, "");
1110 }
1111
1112 static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
1113                            struct lp_build_emit_data *emit_data,
1114                            LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
1115                            LLVMValueRef *fmask_ptr)
1116 {
1117         struct si_shader_context *ctx = si_shader_context(bld_base);
1118         LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers_and_images);
1119         const struct tgsi_full_instruction *inst = emit_data->inst;
1120         const struct tgsi_full_src_register *reg;
1121         unsigned target = inst->Texture.Texture;
1122         unsigned sampler_src;
1123         LLVMValueRef index;
1124
1125         sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
1126         reg = &emit_data->inst->Src[sampler_src];
1127
1128         if (reg->Register.Indirect) {
1129                 index = si_get_bounded_indirect_index(ctx,
1130                                                       &reg->Indirect,
1131                                                       reg->Register.Index,
1132                                                       ctx->num_samplers);
1133                 index = LLVMBuildAdd(ctx->ac.builder, index,
1134                                      LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), "");
1135         } else {
1136                 index = LLVMConstInt(ctx->i32,
1137                                      si_get_sampler_slot(reg->Register.Index), 0);
1138         }
1139
1140         if (reg->Register.File != TGSI_FILE_SAMPLER) {
1141                 /* Bindless descriptors are accessible from a different pair of
1142                  * user SGPR indices.
1143                  */
1144                 list = LLVMGetParam(ctx->main_fn,
1145                                     ctx->param_bindless_samplers_and_images);
1146                 index = lp_build_emit_fetch_src(bld_base, reg,
1147                                                 TGSI_TYPE_UNSIGNED, 0);
1148         }
1149
1150         if (target == TGSI_TEXTURE_BUFFER)
1151                 *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_BUFFER);
1152         else
1153                 *res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_IMAGE);
1154
1155         if (samp_ptr)
1156                 *samp_ptr = NULL;
1157         if (fmask_ptr)
1158                 *fmask_ptr = NULL;
1159
1160         if (target == TGSI_TEXTURE_2D_MSAA ||
1161             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
1162                 if (fmask_ptr)
1163                         *fmask_ptr = si_load_sampler_desc(ctx, list, index,
1164                                                           AC_DESC_FMASK);
1165         } else if (target != TGSI_TEXTURE_BUFFER) {
1166                 if (samp_ptr) {
1167                         *samp_ptr = si_load_sampler_desc(ctx, list, index,
1168                                                          AC_DESC_SAMPLER);
1169                         *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
1170                 }
1171         }
1172 }
1173
1174 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
1175  * incorrectly forces nearest filtering if the texture format is integer.
1176  * The only effect it has on Gather4, which always returns 4 texels for
1177  * bilinear filtering, is that the final coordinates are off by 0.5 of
1178  * the texel size.
1179  *
1180  * The workaround is to subtract 0.5 from the unnormalized coordinates,
1181  * or (0.5 / size) from the normalized coordinates.
1182  *
1183  * However, cube textures with 8_8_8_8 data formats require a different
1184  * workaround of overriding the num format to USCALED/SSCALED. This would lose
1185  * precision in 32-bit data formats, so it needs to be applied dynamically at
1186  * runtime. In this case, return an i1 value that indicates whether the
1187  * descriptor was overridden (and hence a fixup of the sampler result is needed).
1188  */
1189 static LLVMValueRef
1190 si_lower_gather4_integer(struct si_shader_context *ctx,
1191                          struct ac_image_args *args,
1192                          unsigned target,
1193                          enum tgsi_return_type return_type)
1194 {
1195         LLVMBuilderRef builder = ctx->ac.builder;
1196         LLVMValueRef wa_8888 = NULL;
1197         LLVMValueRef half_texel[2];
1198
1199         assert(return_type == TGSI_RETURN_TYPE_SINT ||
1200                return_type == TGSI_RETURN_TYPE_UINT);
1201
1202         if (target == TGSI_TEXTURE_CUBE ||
1203             target == TGSI_TEXTURE_CUBE_ARRAY) {
1204                 LLVMValueRef formats;
1205                 LLVMValueRef data_format;
1206                 LLVMValueRef wa_formats;
1207
1208                 formats = LLVMBuildExtractElement(builder, args->resource, ctx->i32_1, "");
1209
1210                 data_format = LLVMBuildLShr(builder, formats,
1211                                             LLVMConstInt(ctx->i32, 20, false), "");
1212                 data_format = LLVMBuildAnd(builder, data_format,
1213                                            LLVMConstInt(ctx->i32, (1u << 6) - 1, false), "");
1214                 wa_8888 = LLVMBuildICmp(
1215                         builder, LLVMIntEQ, data_format,
1216                         LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false),
1217                         "");
1218
1219                 uint32_t wa_num_format =
1220                         return_type == TGSI_RETURN_TYPE_UINT ?
1221                         S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_USCALED) :
1222                         S_008F14_NUM_FORMAT_GFX6(V_008F14_IMG_NUM_FORMAT_SSCALED);
1223                 wa_formats = LLVMBuildAnd(builder, formats,
1224                                           LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT_GFX6, false),
1225                                           "");
1226                 wa_formats = LLVMBuildOr(builder, wa_formats,
1227                                         LLVMConstInt(ctx->i32, wa_num_format, false), "");
1228
1229                 formats = LLVMBuildSelect(builder, wa_8888, wa_formats, formats, "");
1230                 args->resource = LLVMBuildInsertElement(
1231                         builder, args->resource, formats, ctx->i32_1, "");
1232         }
1233
1234         if (target == TGSI_TEXTURE_RECT ||
1235             target == TGSI_TEXTURE_SHADOWRECT) {
1236                 assert(!wa_8888);
1237                 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
1238         } else {
1239                 struct ac_image_args resinfo = {};
1240                 struct lp_build_if_state if_ctx;
1241
1242                 if (wa_8888) {
1243                         /* Skip the texture size query entirely if we don't need it. */
1244                         lp_build_if(&if_ctx, &ctx->gallivm, LLVMBuildNot(builder, wa_8888, ""));
1245                 }
1246
1247                 /* Query the texture size. */
1248                 resinfo.opcode = ac_image_get_resinfo;
1249                 resinfo.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1250                 resinfo.resource = args->resource;
1251                 resinfo.sampler = args->sampler;
1252                 resinfo.lod = ctx->ac.i32_0;
1253                 resinfo.dmask = 0xf;
1254
1255                 LLVMValueRef texsize =
1256                         fix_resinfo(ctx, target,
1257                                     ac_build_image_opcode(&ctx->ac, &resinfo));
1258
1259                 /* Compute -0.5 / size. */
1260                 for (unsigned c = 0; c < 2; c++) {
1261                         half_texel[c] =
1262                                 LLVMBuildExtractElement(builder, texsize,
1263                                                         LLVMConstInt(ctx->i32, c, 0), "");
1264                         half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
1265                         half_texel[c] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, half_texel[c]);
1266                         half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
1267                                                       LLVMConstReal(ctx->f32, -0.5), "");
1268                 }
1269
1270                 if (wa_8888) {
1271                         lp_build_endif(&if_ctx);
1272
1273                         LLVMBasicBlockRef bb[2] = { if_ctx.true_block, if_ctx.entry_block };
1274
1275                         for (unsigned c = 0; c < 2; c++) {
1276                                 LLVMValueRef values[2] = { half_texel[c], ctx->ac.f32_0 };
1277                                 half_texel[c] = ac_build_phi(&ctx->ac, ctx->f32, 2,
1278                                                              values, bb);
1279                         }
1280                 }
1281         }
1282
1283         for (unsigned c = 0; c < 2; c++) {
1284                 LLVMValueRef tmp;
1285                 tmp = ac_to_float(&ctx->ac, args->coords[c]);
1286                 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
1287                 args->coords[c] = ac_to_integer(&ctx->ac, tmp);
1288         }
1289
1290         return wa_8888;
1291 }
1292
1293 /* The second half of the cube texture 8_8_8_8 integer workaround: adjust the
1294  * result after the gather operation.
1295  */
1296 static LLVMValueRef
1297 si_fix_gather4_integer_result(struct si_shader_context *ctx,
1298                            LLVMValueRef result,
1299                            enum tgsi_return_type return_type,
1300                            LLVMValueRef wa)
1301 {
1302         LLVMBuilderRef builder = ctx->ac.builder;
1303
1304         assert(return_type == TGSI_RETURN_TYPE_SINT ||
1305                return_type == TGSI_RETURN_TYPE_UINT);
1306
1307         for (unsigned chan = 0; chan < 4; ++chan) {
1308                 LLVMValueRef chanv = LLVMConstInt(ctx->i32, chan, false);
1309                 LLVMValueRef value;
1310                 LLVMValueRef wa_value;
1311
1312                 value = LLVMBuildExtractElement(builder, result, chanv, "");
1313
1314                 if (return_type == TGSI_RETURN_TYPE_UINT)
1315                         wa_value = LLVMBuildFPToUI(builder, value, ctx->i32, "");
1316                 else
1317                         wa_value = LLVMBuildFPToSI(builder, value, ctx->i32, "");
1318                 wa_value = ac_to_float(&ctx->ac, wa_value);
1319                 value = LLVMBuildSelect(builder, wa, wa_value, value, "");
1320
1321                 result = LLVMBuildInsertElement(builder, result, value, chanv, "");
1322         }
1323
1324         return result;
1325 }
1326
1327 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
1328                                 struct lp_build_tgsi_context *bld_base,
1329                                 struct lp_build_emit_data *emit_data)
1330 {
1331         struct si_shader_context *ctx = si_shader_context(bld_base);
1332         const struct tgsi_full_instruction *inst = emit_data->inst;
1333         unsigned opcode = inst->Instruction.Opcode;
1334         unsigned target = inst->Texture.Texture;
1335         struct ac_image_args args = {};
1336         int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
1337         unsigned chan;
1338         bool has_offset = inst->Texture.NumOffsets > 0;
1339         LLVMValueRef fmask_ptr = NULL;
1340
1341         tex_fetch_ptrs(bld_base, emit_data, &args.resource, &args.sampler, &fmask_ptr);
1342
1343         if (target == TGSI_TEXTURE_BUFFER) {
1344                 LLVMValueRef vindex = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
1345                 unsigned num_channels =
1346                         util_last_bit(inst->Dst[0].Register.WriteMask);
1347                 LLVMValueRef result =
1348                         ac_build_buffer_load_format(&ctx->ac,
1349                                                     args.resource,
1350                                                     vindex,
1351                                                     ctx->i32_0,
1352                                                     num_channels, false, true);
1353                 emit_data->output[emit_data->chan] =
1354                         ac_build_expand_to_vec4(&ctx->ac, result, num_channels);
1355                 return;
1356         }
1357
1358         /* Fetch and project texture coordinates */
1359         args.coords[3] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_W);
1360         for (chan = 0; chan < 3; chan++) {
1361                 args.coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
1362                 if (opcode == TGSI_OPCODE_TXP)
1363                         args.coords[chan] = ac_build_fdiv(&ctx->ac,
1364                                 args.coords[chan], args.coords[3]);
1365         }
1366
1367         if (opcode == TGSI_OPCODE_TXP)
1368                 args.coords[3] = ctx->ac.f32_1;
1369
1370         /* Pack offsets. */
1371         if (has_offset &&
1372             opcode != TGSI_OPCODE_TXF &&
1373             opcode != TGSI_OPCODE_TXF_LZ) {
1374                 /* The offsets are six-bit signed integers packed like this:
1375                  *   X=[5:0], Y=[13:8], and Z=[21:16].
1376                  */
1377                 LLVMValueRef offset[3], pack;
1378
1379                 assert(inst->Texture.NumOffsets == 1);
1380
1381                 for (chan = 0; chan < 3; chan++) {
1382                         offset[chan] = lp_build_emit_fetch_texoffset(bld_base, inst, 0, chan);
1383                         offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan],
1384                                                     LLVMConstInt(ctx->i32, 0x3f, 0), "");
1385                         if (chan)
1386                                 offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan],
1387                                                             LLVMConstInt(ctx->i32, chan*8, 0), "");
1388                 }
1389
1390                 pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], "");
1391                 pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], "");
1392                 args.offset = pack;
1393         }
1394
1395         /* Pack LOD bias value */
1396         if (opcode == TGSI_OPCODE_TXB)
1397                 args.bias = args.coords[3];
1398         if (opcode == TGSI_OPCODE_TXB2)
1399                 args.bias = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1400
1401         /* Pack depth comparison value */
1402         if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
1403                 LLVMValueRef z;
1404
1405                 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1406                         z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1407                 } else {
1408                         assert(ref_pos >= 0);
1409                         z = args.coords[ref_pos];
1410                 }
1411
1412                 /* Section 8.23.1 (Depth Texture Comparison Mode) of the
1413                  * OpenGL 4.5 spec says:
1414                  *
1415                  *    "If the texture’s internal format indicates a fixed-point
1416                  *     depth texture, then D_t and D_ref are clamped to the
1417                  *     range [0, 1]; otherwise no clamping is performed."
1418                  *
1419                  * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
1420                  * so the depth comparison value isn't clamped for Z16 and
1421                  * Z24 anymore. Do it manually here.
1422                  */
1423                 if (ctx->screen->info.chip_class >= VI) {
1424                         LLVMValueRef upgraded;
1425                         LLVMValueRef clamped;
1426                         upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler,
1427                                                            LLVMConstInt(ctx->i32, 3, false), "");
1428                         upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded,
1429                                                  LLVMConstInt(ctx->i32, 29, false), "");
1430                         upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->i1, "");
1431                         clamped = ac_build_clamp(&ctx->ac, z);
1432                         z = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, z, "");
1433                 }
1434
1435                 args.compare = z;
1436         }
1437
1438         /* Pack user derivatives */
1439         if (opcode == TGSI_OPCODE_TXD) {
1440                 int param, num_src_deriv_channels, num_dst_deriv_channels;
1441
1442                 switch (target) {
1443                 case TGSI_TEXTURE_3D:
1444                         num_src_deriv_channels = 3;
1445                         num_dst_deriv_channels = 3;
1446                         break;
1447                 case TGSI_TEXTURE_2D:
1448                 case TGSI_TEXTURE_SHADOW2D:
1449                 case TGSI_TEXTURE_RECT:
1450                 case TGSI_TEXTURE_SHADOWRECT:
1451                 case TGSI_TEXTURE_2D_ARRAY:
1452                 case TGSI_TEXTURE_SHADOW2D_ARRAY:
1453                         num_src_deriv_channels = 2;
1454                         num_dst_deriv_channels = 2;
1455                         break;
1456                 case TGSI_TEXTURE_CUBE:
1457                 case TGSI_TEXTURE_SHADOWCUBE:
1458                 case TGSI_TEXTURE_CUBE_ARRAY:
1459                 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
1460                         /* Cube derivatives will be converted to 2D. */
1461                         num_src_deriv_channels = 3;
1462                         num_dst_deriv_channels = 3;
1463                         break;
1464                 case TGSI_TEXTURE_1D:
1465                 case TGSI_TEXTURE_SHADOW1D:
1466                 case TGSI_TEXTURE_1D_ARRAY:
1467                 case TGSI_TEXTURE_SHADOW1D_ARRAY:
1468                         num_src_deriv_channels = 1;
1469
1470                         /* 1D textures are allocated and used as 2D on GFX9. */
1471                         if (ctx->screen->info.chip_class >= GFX9) {
1472                                 num_dst_deriv_channels = 2;
1473                         } else {
1474                                 num_dst_deriv_channels = 1;
1475                         }
1476                         break;
1477                 default:
1478                         unreachable("invalid target");
1479                 }
1480
1481                 for (param = 0; param < 2; param++) {
1482                         for (chan = 0; chan < num_src_deriv_channels; chan++)
1483                                 args.derivs[param * num_dst_deriv_channels + chan] =
1484                                         lp_build_emit_fetch(bld_base, inst, param+1, chan);
1485
1486                         /* Fill in the rest with zeros. */
1487                         for (chan = num_src_deriv_channels;
1488                              chan < num_dst_deriv_channels; chan++)
1489                                 args.derivs[param * num_dst_deriv_channels + chan] =
1490                                         ctx->ac.f32_0;
1491                 }
1492         }
1493
1494         if (target == TGSI_TEXTURE_CUBE ||
1495             target == TGSI_TEXTURE_CUBE_ARRAY ||
1496             target == TGSI_TEXTURE_SHADOWCUBE ||
1497             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1498                 ac_prepare_cube_coords(&ctx->ac,
1499                                        opcode == TGSI_OPCODE_TXD,
1500                                        target == TGSI_TEXTURE_CUBE_ARRAY ||
1501                                        target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
1502                                        opcode == TGSI_OPCODE_LODQ,
1503                                        args.coords, args.derivs);
1504         } else if (tgsi_is_array_sampler(target) &&
1505                    opcode != TGSI_OPCODE_TXF &&
1506                    opcode != TGSI_OPCODE_TXF_LZ &&
1507                    ctx->screen->info.chip_class <= VI) {
1508                 unsigned array_coord = target == TGSI_TEXTURE_1D_ARRAY ? 1 : 2;
1509                 args.coords[array_coord] =
1510                         ac_build_intrinsic(&ctx->ac, "llvm.rint.f32", ctx->f32,
1511                                            &args.coords[array_coord], 1, 0);
1512         }
1513
1514         /* 1D textures are allocated and used as 2D on GFX9. */
1515         if (ctx->screen->info.chip_class >= GFX9) {
1516                 LLVMValueRef filler;
1517
1518                 /* Use 0.5, so that we don't sample the border color. */
1519                 if (opcode == TGSI_OPCODE_TXF ||
1520                     opcode == TGSI_OPCODE_TXF_LZ)
1521                         filler = ctx->i32_0;
1522                 else
1523                         filler = LLVMConstReal(ctx->f32, 0.5);
1524
1525                 if (target == TGSI_TEXTURE_1D ||
1526                     target == TGSI_TEXTURE_SHADOW1D) {
1527                         args.coords[1] = filler;
1528                 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
1529                            target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
1530                         args.coords[2] = args.coords[1];
1531                         args.coords[1] = filler;
1532                 }
1533         }
1534
1535         /* Pack LOD or sample index */
1536         if (opcode == TGSI_OPCODE_TXL)
1537                 args.lod = args.coords[3];
1538         else if (opcode == TGSI_OPCODE_TXL2)
1539                 args.lod = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
1540         else if (opcode == TGSI_OPCODE_TXF) {
1541                 if (target == TGSI_TEXTURE_2D_MSAA) {
1542                         /* No LOD, but move sample index into the right place. */
1543                         args.coords[2] = args.coords[3];
1544                 } else if (target != TGSI_TEXTURE_2D_ARRAY_MSAA) {
1545                         args.lod = args.coords[3];
1546                 }
1547         }
1548
1549         if (target == TGSI_TEXTURE_2D_MSAA ||
1550             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
1551                 ac_apply_fmask_to_sample(&ctx->ac, fmask_ptr, args.coords,
1552                                          target == TGSI_TEXTURE_2D_ARRAY_MSAA);
1553         }
1554
1555         if (opcode == TGSI_OPCODE_TXF ||
1556             opcode == TGSI_OPCODE_TXF_LZ) {
1557                 /* add tex offsets */
1558                 if (inst->Texture.NumOffsets) {
1559                         const struct tgsi_texture_offset *off = inst->TexOffsets;
1560
1561                         assert(inst->Texture.NumOffsets == 1);
1562
1563                         switch (target) {
1564                         case TGSI_TEXTURE_3D:
1565                                 args.coords[2] =
1566                                         LLVMBuildAdd(ctx->ac.builder, args.coords[2],
1567                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ], "");
1568                                 /* fall through */
1569                         case TGSI_TEXTURE_2D:
1570                         case TGSI_TEXTURE_SHADOW2D:
1571                         case TGSI_TEXTURE_RECT:
1572                         case TGSI_TEXTURE_SHADOWRECT:
1573                         case TGSI_TEXTURE_2D_ARRAY:
1574                         case TGSI_TEXTURE_SHADOW2D_ARRAY:
1575                                 args.coords[1] =
1576                                         LLVMBuildAdd(ctx->ac.builder, args.coords[1],
1577                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY], "");
1578                                 /* fall through */
1579                         case TGSI_TEXTURE_1D:
1580                         case TGSI_TEXTURE_SHADOW1D:
1581                         case TGSI_TEXTURE_1D_ARRAY:
1582                         case TGSI_TEXTURE_SHADOW1D_ARRAY:
1583                                 args.coords[0] =
1584                                         LLVMBuildAdd(ctx->ac.builder, args.coords[0],
1585                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX], "");
1586                                 break;
1587                                 /* texture offsets do not apply to other texture targets */
1588                         }
1589                 }
1590         }
1591
1592         if (opcode == TGSI_OPCODE_TG4) {
1593                 unsigned gather_comp = 0;
1594
1595                 /* DMASK was repurposed for GATHER4. 4 components are always
1596                  * returned and DMASK works like a swizzle - it selects
1597                  * the component to fetch. The only valid DMASK values are
1598                  * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
1599                  * (red,red,red,red) etc.) The ISA document doesn't mention
1600                  * this.
1601                  */
1602
1603                 /* Get the component index from src1.x for Gather4. */
1604                 if (!tgsi_is_shadow_target(target)) {
1605                         LLVMValueRef comp_imm;
1606                         struct tgsi_src_register src1 = inst->Src[1].Register;
1607
1608                         assert(src1.File == TGSI_FILE_IMMEDIATE);
1609
1610                         comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
1611                         gather_comp = LLVMConstIntGetZExtValue(comp_imm);
1612                         gather_comp = CLAMP(gather_comp, 0, 3);
1613                 }
1614
1615                 args.dmask = 1 << gather_comp;
1616         } else {
1617                 args.dmask = 0xf;
1618         }
1619
1620         args.dim = ac_texture_dim_from_tgsi_target(ctx->screen, target);
1621         args.unorm = target == TGSI_TEXTURE_RECT ||
1622                      target == TGSI_TEXTURE_SHADOWRECT;
1623         args.opcode = ac_image_sample;
1624
1625         switch (opcode) {
1626         case TGSI_OPCODE_TXF:
1627         case TGSI_OPCODE_TXF_LZ:
1628                 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
1629                               target == TGSI_TEXTURE_2D_MSAA ||
1630                               target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
1631                                       ac_image_load : ac_image_load_mip;
1632                 break;
1633         case TGSI_OPCODE_LODQ:
1634                 args.opcode = ac_image_get_lod;
1635                 break;
1636         case TGSI_OPCODE_TEX:
1637         case TGSI_OPCODE_TEX2:
1638         case TGSI_OPCODE_TXP:
1639                 if (ctx->type != PIPE_SHADER_FRAGMENT)
1640                         args.level_zero = true;
1641                 break;
1642         case TGSI_OPCODE_TEX_LZ:
1643                 args.level_zero = true;
1644                 break;
1645         case TGSI_OPCODE_TXB:
1646         case TGSI_OPCODE_TXB2:
1647                 assert(ctx->type == PIPE_SHADER_FRAGMENT);
1648                 break;
1649         case TGSI_OPCODE_TXL:
1650         case TGSI_OPCODE_TXL2:
1651                 break;
1652         case TGSI_OPCODE_TXD:
1653                 break;
1654         case TGSI_OPCODE_TG4:
1655                 args.opcode = ac_image_gather4;
1656                 args.level_zero = true;
1657                 break;
1658         default:
1659                 assert(0);
1660                 return;
1661         }
1662
1663         /* The hardware needs special lowering for Gather4 with integer formats. */
1664         LLVMValueRef gather4_int_result_workaround = NULL;
1665
1666         if (ctx->screen->info.chip_class <= VI &&
1667             opcode == TGSI_OPCODE_TG4) {
1668                 assert(inst->Texture.ReturnType != TGSI_RETURN_TYPE_UNKNOWN);
1669
1670                 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
1671                     inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
1672                         gather4_int_result_workaround =
1673                                 si_lower_gather4_integer(ctx, &args, target,
1674                                                          inst->Texture.ReturnType);
1675                 }
1676         }
1677
1678         args.attributes = AC_FUNC_ATTR_READNONE;
1679         LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
1680
1681         if (gather4_int_result_workaround) {
1682                 result = si_fix_gather4_integer_result(ctx, result,
1683                                                        inst->Texture.ReturnType,
1684                                                        gather4_int_result_workaround);
1685         }
1686
1687         emit_data->output[emit_data->chan] = result;
1688 }
1689
1690 static void si_llvm_emit_txqs(
1691         const struct lp_build_tgsi_action *action,
1692         struct lp_build_tgsi_context *bld_base,
1693         struct lp_build_emit_data *emit_data)
1694 {
1695         struct si_shader_context *ctx = si_shader_context(bld_base);
1696         LLVMValueRef res, samples;
1697         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
1698
1699         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
1700
1701         /* Read the samples from the descriptor directly. */
1702         res = LLVMBuildBitCast(ctx->ac.builder, res_ptr, ctx->v8i32, "");
1703         samples = LLVMBuildExtractElement(ctx->ac.builder, res,
1704                                           LLVMConstInt(ctx->i32, 3, 0), "");
1705         samples = LLVMBuildLShr(ctx->ac.builder, samples,
1706                                 LLVMConstInt(ctx->i32, 16, 0), "");
1707         samples = LLVMBuildAnd(ctx->ac.builder, samples,
1708                                LLVMConstInt(ctx->i32, 0xf, 0), "");
1709         samples = LLVMBuildShl(ctx->ac.builder, ctx->i32_1,
1710                                samples, "");
1711
1712         emit_data->output[emit_data->chan] = samples;
1713 }
1714
1715 static void si_llvm_emit_fbfetch(const struct lp_build_tgsi_action *action,
1716                                  struct lp_build_tgsi_context *bld_base,
1717                                  struct lp_build_emit_data *emit_data)
1718 {
1719         struct si_shader_context *ctx = si_shader_context(bld_base);
1720         struct ac_image_args args = {};
1721         LLVMValueRef ptr, image, fmask;
1722
1723         /* Ignore src0, because KHR_blend_func_extended disallows multiple render
1724          * targets.
1725          */
1726
1727         /* Load the image descriptor. */
1728         STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
1729         ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1730         ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr,
1731                                    ac_array_in_const32_addr_space(ctx->v8i32), "");
1732         image = ac_build_load_to_sgpr(&ctx->ac, ptr,
1733                         LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
1734
1735         unsigned chan = 0;
1736
1737         args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 0, 16);
1738
1739         if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D)
1740                 args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 16, 16);
1741
1742         /* Get the current render target layer index. */
1743         if (ctx->shader->key.mono.u.ps.fbfetch_layered)
1744                 args.coords[chan++] = si_unpack_param(ctx, SI_PARAM_ANCILLARY, 16, 11);
1745
1746         if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
1747                 args.coords[chan++] = si_get_sample_id(ctx);
1748
1749         if (ctx->shader->key.mono.u.ps.fbfetch_msaa) {
1750                 fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
1751                         LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
1752
1753                 ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
1754                                          ctx->shader->key.mono.u.ps.fbfetch_layered);
1755         }
1756
1757         args.opcode = ac_image_load;
1758         args.resource = image;
1759         args.dmask = 0xf;
1760         if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
1761                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1762                         ac_image_2darraymsaa : ac_image_2dmsaa;
1763         else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D)
1764                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1765                         ac_image_1darray : ac_image_1d;
1766         else
1767                 args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ?
1768                         ac_image_2darray : ac_image_2d;
1769
1770         emit_data->output[emit_data->chan] =
1771                 ac_build_image_opcode(&ctx->ac, &args);
1772 }
1773
1774 /**
1775  * Setup actions for TGSI memory opcode, including texture opcodes.
1776  */
1777 void si_shader_context_init_mem(struct si_shader_context *ctx)
1778 {
1779         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1780
1781         bld_base->op_actions[TGSI_OPCODE_TEX].emit = build_tex_intrinsic;
1782         bld_base->op_actions[TGSI_OPCODE_TEX_LZ].emit = build_tex_intrinsic;
1783         bld_base->op_actions[TGSI_OPCODE_TEX2].emit = build_tex_intrinsic;
1784         bld_base->op_actions[TGSI_OPCODE_TXB].emit = build_tex_intrinsic;
1785         bld_base->op_actions[TGSI_OPCODE_TXB2].emit = build_tex_intrinsic;
1786         bld_base->op_actions[TGSI_OPCODE_TXD].emit = build_tex_intrinsic;
1787         bld_base->op_actions[TGSI_OPCODE_TXF].emit = build_tex_intrinsic;
1788         bld_base->op_actions[TGSI_OPCODE_TXF_LZ].emit = build_tex_intrinsic;
1789         bld_base->op_actions[TGSI_OPCODE_TXL].emit = build_tex_intrinsic;
1790         bld_base->op_actions[TGSI_OPCODE_TXL2].emit = build_tex_intrinsic;
1791         bld_base->op_actions[TGSI_OPCODE_TXP].emit = build_tex_intrinsic;
1792         bld_base->op_actions[TGSI_OPCODE_TXQ].emit = resq_emit;
1793         bld_base->op_actions[TGSI_OPCODE_TG4].emit = build_tex_intrinsic;
1794         bld_base->op_actions[TGSI_OPCODE_LODQ].emit = build_tex_intrinsic;
1795         bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
1796
1797         bld_base->op_actions[TGSI_OPCODE_FBFETCH].emit = si_llvm_emit_fbfetch;
1798
1799         bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
1800         bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
1801         bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
1802
1803         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].emit = atomic_emit;
1804         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
1805         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].emit = atomic_emit;
1806         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
1807         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].emit = atomic_emit;
1808         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
1809         bld_base->op_actions[TGSI_OPCODE_ATOMAND].emit = atomic_emit;
1810         bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
1811         bld_base->op_actions[TGSI_OPCODE_ATOMOR].emit = atomic_emit;
1812         bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
1813         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].emit = atomic_emit;
1814         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
1815         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].emit = atomic_emit;
1816         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
1817         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].emit = atomic_emit;
1818         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
1819         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].emit = atomic_emit;
1820         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
1821         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].emit = atomic_emit;
1822         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
1823 }