Revert "Revert "i965/fs: Use align1 mode on ternary instructions on Gen10+""
[mesa.git] src/intel/compiler/brw_fs_surface_builder.cpp
/*
 * Copyright © 2013-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "isl/isl.h"
#include "brw_fs_surface_builder.h"
#include "brw_fs.h"

using namespace brw;

namespace brw {
   namespace surface_access {
      namespace {
         /**
          * Generate a logical send opcode for a surface message and return
          * the result.
          */
         fs_reg
         emit_send(const fs_builder &bld, enum opcode opcode,
                   const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
                   unsigned dims, unsigned arg, unsigned rsize,
                   brw_predicate pred = BRW_PREDICATE_NONE)
         {
            /* Reduce the dynamically uniform surface index to a single
             * scalar.
             */
            const fs_reg usurface = bld.emit_uniformize(surface);
            const fs_reg srcs[] = {
               addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
            };
            const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
            fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));

            inst->size_written = rsize * dst.component_size(inst->exec_size);
            inst->predicate = pred;
            return dst;
         }
      }

      /**
       * Emit an untyped surface read opcode. \p dims determines the number
       * of components of the address and \p size the number of components of
       * the returned value.
       */
      fs_reg
      emit_untyped_read(const fs_builder &bld,
                        const fs_reg &surface, const fs_reg &addr,
                        unsigned dims, unsigned size,
                        brw_predicate pred)
      {
         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size, pred);
      }

      /**
       * Emit an untyped surface write opcode. \p dims determines the number
       * of components of the address and \p size the number of components of
       * the argument.
       */
      void
      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
                         const fs_reg &addr, const fs_reg &src,
                         unsigned dims, unsigned size,
                         brw_predicate pred)
      {
         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0, pred);
      }

      /**
       * Emit an untyped surface atomic opcode. \p dims determines the number
       * of components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_untyped_atomic(const fs_builder &bld,
                          const fs_reg &surface, const fs_reg &addr,
                          const fs_reg &src0, const fs_reg &src1,
                          unsigned dims, unsigned rsize, unsigned op,
                          brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }

      /**
       * Emit a typed surface read opcode. \p dims determines the number of
       * components of the address and \p size the number of components of the
       * returned value.
       */
      fs_reg
      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
                      const fs_reg &addr, unsigned dims, unsigned size)
      {
         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size);
      }

      /**
       * Emit a typed surface write opcode. \p dims determines the number of
       * components of the address and \p size the number of components of the
       * argument.
       */
      void
      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned dims, unsigned size)
      {
         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0);
      }

      /**
       * Emit a typed surface atomic opcode. \p dims determines the number of
       * components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
                        const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned dims, unsigned rsize, unsigned op,
                        brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize);
      }

      fs_reg
      emit_byte_scattered_read(const fs_builder &bld,
                               const fs_reg &surface, const fs_reg &addr,
                               unsigned dims, unsigned size,
                               unsigned bit_size, brw_predicate pred)
      {
         return emit_send(bld, SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, bit_size, size, pred);
      }

      void
      emit_byte_scattered_write(const fs_builder &bld, const fs_reg &surface,
                                const fs_reg &addr, const fs_reg &src,
                                unsigned dims, unsigned size,
                                unsigned bit_size, brw_predicate pred)
      {
         emit_send(bld, SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
                   addr, src, surface, dims, bit_size, 0, pred);
      }
   }
}

namespace {
   namespace image_format_info {
      /* The higher compiler layers use the GL enums for image formats even if
       * they come in from SPIR-V or Vulkan. We need to turn them into an ISL
       * enum before we can use them.
       */
      static enum isl_format
      isl_format_for_gl_format(uint32_t gl_format)
      {
         switch (gl_format) {
         case GL_R8:             return ISL_FORMAT_R8_UNORM;
         case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
         case GL_R8UI:           return ISL_FORMAT_R8_UINT;
         case GL_R8I:            return ISL_FORMAT_R8_SINT;
         case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
         case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
         case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
         case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
         case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
         case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
         case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
         case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
         case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
         case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
         case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
         case GL_R16:            return ISL_FORMAT_R16_UNORM;
         case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
         case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
         case GL_R16UI:          return ISL_FORMAT_R16_UINT;
         case GL_R16I:           return ISL_FORMAT_R16_SINT;
         case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
         case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
         case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
         case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
         case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
         case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
         case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
         case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
         case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
         case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
         case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
         case GL_R32UI:          return ISL_FORMAT_R32_UINT;
         case GL_R32I:           return ISL_FORMAT_R32_SINT;
         case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
         case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
         case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
         case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
         case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
         case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
         case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
         default:
            assert(!"Invalid image format");
            return ISL_FORMAT_UNSUPPORTED;
         }
      }

      /**
       * Simple 4-tuple of scalars used to pass around per-color component
       * values.
       */
      struct color_u {
         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
         {
         }

         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
            r(r), g(g), b(b), a(a)
         {
         }

         unsigned
         operator[](unsigned i) const
         {
            const unsigned xs[] = { r, g, b, a };
            return xs[i];
         }

         unsigned r, g, b, a;
      };

      /**
       * Return the per-channel bitfield widths for a given image format.
       */
      inline color_u
      get_bit_widths(isl_format format)
      {
         const isl_format_layout *fmtl = isl_format_get_layout(format);

         return color_u(fmtl->channels.r.bits,
                        fmtl->channels.g.bits,
                        fmtl->channels.b.bits,
                        fmtl->channels.a.bits);
      }

      /**
       * Return the per-channel bitfield shifts for a given image format.
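       *
       * For example, ISL_FORMAT_R8G8B8A8_UNORM has per-channel widths of
       * (8, 8, 8, 8), which gives shifts of (0, 8, 16, 24) below.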
       */
      inline color_u
      get_bit_shifts(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return color_u(0, widths.r, widths.r + widths.g,
                        widths.r + widths.g + widths.b);
      }

      /**
       * Return true if all present components have the same bit width.
       */
      inline bool
      is_homogeneous(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return ((widths.g == 0 || widths.g == widths.r) &&
                 (widths.b == 0 || widths.b == widths.r) &&
                 (widths.a == 0 || widths.a == widths.r));
      }

      /**
       * Return true if the format conversion boils down to a trivial copy.
       */
      inline bool
      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
      {
         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
                format == isl_lower_storage_image_format(devinfo, format);
      }

      /**
       * Return true if the hardware natively supports some format with
       * compatible bitfield layout, but possibly different data types.
       */
      inline bool
      has_supported_bit_layout(const gen_device_info *devinfo,
                               isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         const color_u lower_widths = get_bit_widths(
            isl_lower_storage_image_format(devinfo, format));

         return (widths.r == lower_widths.r &&
                 widths.g == lower_widths.g &&
                 widths.b == lower_widths.b &&
                 widths.a == lower_widths.a);
      }

      /**
       * Return true if we are required to spread individual components over
       * several components of the format used by the hardware (RG32 and
       * friends implemented as RGBA16UI).
       */
      inline bool
      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (isl_format_get_num_channels(format) <
                 isl_format_get_num_channels(lower_format));
      }

      /**
       * Return true if the hardware returns garbage in the unused high bits
       * of each component. This may happen on IVB because we rely on the
       * undocumented behavior that typed reads from surfaces of the
       * unsupported R8 and R16 formats return useful data in their least
       * significant bits.
       */
      inline bool
      has_undefined_high_bits(const gen_device_info *devinfo,
                              isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (devinfo->gen == 7 && !devinfo->is_haswell &&
                 (lower_format == ISL_FORMAT_R16_UINT ||
                  lower_format == ISL_FORMAT_R8_UINT));
      }

      /**
       * Return true if the format represents values as signed integers
       * requiring sign extension when unpacking.
       */
      inline bool
      needs_sign_extension(isl_format format)
      {
         return isl_format_has_snorm_channel(format) ||
                isl_format_has_sint_channel(format);
      }
   }

   namespace image_validity {
      /**
       * Check whether the bound image is suitable for untyped access.
       */
      static brw_predicate
      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
                               brw_predicate pred)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check whether the first stride component (i.e. the Bpp value)
             * is greater than four, which on Gen7 indicates that a surface of
             * type RAW has been bound for untyped access. Reading or writing
             * to a surface of type other than RAW using untyped surface
             * messages causes a hang on IVB and VLV.
             */
            set_predicate(pred,
                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
                                  BRW_CONDITIONAL_G));

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent generations handle the format mismatch
             * gracefully.
             */
            return pred;
         }
      }

      /**
       * Check whether there is an image bound at the given index and write
       * the comparison result to f0.0. Returns an appropriate predication
       * mode to use on subsequent image operations.
       */
      static brw_predicate
      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check the first component of the size field to find out if the
             * image is bound. Necessary on IVB for typed atomics because
             * they don't seem to respect null surfaces and will happily
             * corrupt or read random memory when no image is bound.
             */
            bld.CMP(bld.null_reg_ud(),
                    retype(size, BRW_REGISTER_TYPE_UD),
                    brw_imm_d(0), BRW_CONDITIONAL_NZ);

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent platforms implement compliant behavior when a null
             * surface is bound.
             */
            return BRW_PREDICATE_NONE;
         }
      }

      /**
       * Check whether the provided coordinates are within the image bounds
       * and write the comparison result to f0.0. Returns an appropriate
       * predication mode to use on subsequent image operations.
       */
      static brw_predicate
      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
                        const fs_reg &addr, unsigned dims)
      {
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         for (unsigned c = 0; c < dims; ++c)
            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
                          bld.CMP(bld.null_reg_ud(),
                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
                                  offset(size, bld, c),
                                  BRW_CONDITIONAL_L));

         return BRW_PREDICATE_NORMAL;
      }
   }

   namespace image_coordinates {
      /**
       * Return the total number of coordinates needed to address a texel of
       * the surface, which may be more than the sum of \p surf_dims and \p
       * arr_dims if padding is required.
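       *
       * For example, a 1D array image that has to be lowered to untyped
       * access needs three coordinates, because the array index must be
       * passed in the Z component with a zero inserted for Y.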
       */
      static unsigned
      num_image_coordinates(const fs_builder &bld,
                            unsigned surf_dims, unsigned arr_dims,
                            isl_format format)
      {
         /* HSW in vec4 mode and our software coordinate handling for untyped
          * reads want the array index to be at the Z component.
          */
         const bool array_index_at_z =
            format != ISL_FORMAT_UNSUPPORTED &&
            !isl_has_matching_typed_storage_image_format(
               bld.shader->devinfo, format);
         const unsigned zero_dims =
            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);

         return surf_dims + zero_dims + arr_dims;
      }

      /**
       * Transform image coordinates into the form expected by the
       * implementation.
       */
      static fs_reg
      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
                             unsigned surf_dims, unsigned arr_dims,
                             isl_format format)
      {
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (dims > surf_dims + arr_dims) {
            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
            /* The array index is required to be passed in as the Z component,
             * insert a zero at the Y component to shift it to the right
             * position.
             *
             * FINISHME: Factor out this frequently recurring pattern into a
             * helper function.
             */
            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
            const fs_reg dst = bld.vgrf(addr.type, dims);
            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
            return dst;
         } else {
            return addr;
         }
      }

      /**
       * Calculate the offset in memory of the texel given by \p coord.
       *
       * This is meant to be used with untyped surface messages to access a
       * tiled surface, which involves manually taking into account the
       * tiling and swizzling modes of the surface, so it will hopefully not
       * happen very often.
       *
       * The tiling algorithm implemented here matches either the X or Y
       * tiling layouts supported by the hardware depending on the tiling
       * coefficients passed to the program as uniforms. See Volume 1 Part 2
       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
       * explanation of the hardware tiling format.
       */
      static fs_reg
      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
                               const fs_reg &coord, unsigned dims)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);

         /* Shift the coordinates by the fixed surface offset. It may be
          * non-zero if the image is a single slice of a higher-dimensional
          * surface, or if a non-zero mipmap level of the surface is bound to
          * the pipeline. The offset needs to be applied here rather than at
          * surface state set-up time because the desired slice-level may
          * start mid-tile, so simply shifting the surface base address
          * wouldn't give a well-formed tiled surface in the general case.
          */
         for (unsigned c = 0; c < 2; ++c)
            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
                    (c < dims ?
                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
                     fs_reg(brw_imm_d(0))));

         /* The layout of 3-D textures in memory is sort-of like a tiling
          * format. At each miplevel, the slices are arranged in rows of
          * 2^level slices per row. The slice row is stored in tmp.y and
          * the slice within the row is stored in tmp.x.
          *
          * The layout of 2-D array textures and cubemaps is much simpler:
          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
          * stored in memory as an array of slices, each one being a 2-D
          * arrangement of miplevels, or as a 2D arrangement of miplevels,
          * each one being an array of slices. In either case the separation
          * between slices of the same LOD is equal to the qpitch value
          * provided as stride.w.
          *
          * This code can handle both 2D array and 3D textures by passing in
          * the miplevel as tile.z for 3-D textures and 0 in tile.z for 2-D
          * array textures.
          *
          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
          * of the hardware 3D texture and 2D array layouts.
          */
         if (dims > 2) {
            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
             * index.
             */
            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
            bld.SHR(offset(tmp, bld, 1),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
                    offset(tile, bld, 2));

            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
             * slice offset.
             */
            for (unsigned c = 0; c < 2; ++c) {
               bld.MUL(offset(tmp, bld, c),
                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
               bld.ADD(offset(addr, bld, c),
                       offset(addr, bld, c), offset(tmp, bld, c));
            }
         }

         if (dims > 1) {
            /* Calculate the major/minor x and y indices. In order to
             * accommodate both X and Y tiling, the Y-major tiling format is
             * treated as being a bunch of narrow X-tiles placed next to each
             * other. This means that the tile width for Y-tiling is actually
             * the width of one sub-column of the Y-major tile where each 4K
             * tile has 8 512B sub-columns.
             *
             * The major Y value is the row of tiles in which the pixel lives.
             * The major X value is the tile sub-column in which the pixel
             * lives; for X tiling, this is the same as the tile column, for Y
             * tiling, each tile has 8 sub-columns. The minor X and Y indices
             * are the position within the sub-column.
             */
            for (unsigned c = 0; c < 2; ++c) {
               /* Calculate the minor x and y indices. */
               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
                       brw_imm_d(0), offset(addr, bld, c));

               /* Calculate the major x and y indices. */
               bld.SHR(offset(major, bld, c),
                       offset(addr, bld, c), offset(tile, bld, c));
            }

            /* Calculate the texel index from the start of the tile row and
             * the vertical coordinate of the row.
             * Equivalent to:
             *   tmp.x = (major.x << tile.y << tile.x) +
             *           (minor.y << tile.x) + minor.x
             *   tmp.y = major.y << tile.y
             */
            bld.SHL(tmp, major, offset(tile, bld, 1));
            bld.ADD(tmp, tmp, offset(minor, bld, 1));
            bld.SHL(tmp, tmp, offset(tile, bld, 0));
            bld.ADD(tmp, tmp, minor);
            bld.SHL(offset(tmp, bld, 1),
                    offset(major, bld, 1), offset(tile, bld, 1));

            /* Add it to the start of the tile row. */
            bld.MUL(offset(tmp, bld, 1),
                    offset(tmp, bld, 1), offset(stride, bld, 1));
            bld.ADD(tmp, tmp, offset(tmp, bld, 1));

            /* Multiply by the Bpp value. */
            bld.MUL(dst, tmp, stride);

            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
               /* Take into account the two dynamically specified shifts.
                * Both are needed to implement swizzling of X-tiled
                * surfaces. For Y-tiled surfaces only one bit needs to be
                * XOR-ed with bit 6 of the memory address, so a swz value of
                * 0xff (actually interpreted as 31 by the hardware) will be
                * provided to cause the relevant bit of tmp.y to be zero and
                * turn the first XOR into the identity. For linear surfaces
                * or platforms lacking address swizzling both shifts will be
                * 0xff causing the relevant bits of both tmp.x and .y to be
                * zero, which effectively disables swizzling.
                */
               for (unsigned c = 0; c < 2; ++c)
                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));

               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
               bld.XOR(dst, dst, tmp);
            }

         } else {
            /* Multiply by the Bpp/stride value. Note that addr.y may be
             * non-zero even if the image is one-dimensional because a
             * vertical offset may have been applied above to select a
             * non-zero slice or level of a higher-dimensional texture.
             */
            bld.MUL(offset(addr, bld, 1),
                    offset(addr, bld, 1), offset(stride, bld, 1));
            bld.ADD(addr, addr, offset(addr, bld, 1));
            bld.MUL(dst, addr, stride);
         }

         return dst;
      }
   }

   namespace image_format_conversion {
      using image_format_info::color_u;

      namespace {
         /**
          * Maximum representable value in an unsigned integer with the given
          * number of bits.
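          *
          * e.g. scale(8) == 255 and scale(16) == 65535; these are also the
          * normalization constants used for the UNORM/SNORM conversions
          * below.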
          */
         inline unsigned
         scale(unsigned n)
         {
            return (1 << n) - 1;
         }
      }

      /**
       * Pack the vector \p src in a bitfield given the per-component bit
       * shifts and widths. Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
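       *
       * For example, with 8-bit widths and shifts of (0, 8, 16, 24) (RGBA8)
       * this amounts to dst.x = src.x | (src.y << 8) | (src.z << 16) |
       * (src.w << 24), with all four components landing in the first dword.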
       */
      static fs_reg
      emit_pack(const fs_builder &bld, const fs_reg &src,
                const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         bool seen[4] = {};

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);

               /* Shift each component left to the correct bitfield position. */
               bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));

               /* Add everything up. */
               if (seen[shifts[c] / 32]) {
                  bld.OR(offset(dst, bld, shifts[c] / 32),
                         offset(dst, bld, shifts[c] / 32), tmp);
               } else {
                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
                  seen[shifts[c] / 32] = true;
               }
            }
         }

         return dst;
      }

      /**
       * Unpack a vector from the bitfield \p src given the per-component bit
       * shifts and widths. Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
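       *
       * The SHL/ASR pair below also sign-extends signed components: e.g. an
       * 8-bit channel at shift 16 is moved up to bits 24..31 by the SHL and
       * then shifted back down by the ASR, which replicates the sign bit
       * when \p src has a signed type.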
       */
      static fs_reg
      emit_unpack(const fs_builder &bld, const fs_reg &src,
                  const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Shift left to discard the most significant bits. */
               bld.SHL(offset(dst, bld, c),
                       offset(src, bld, shifts[c] / 32),
                       brw_imm_ud(32 - shifts[c] % 32 - widths[c]));

               /* Shift back to the least significant bits using an arithmetic
                * shift to get sign extension on signed types.
                */
               bld.ASR(offset(dst, bld, c),
                       offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Convert an integer vector into another integer vector of the
       * specified bit widths, properly handling overflow.
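       *
       * For example, with an 8-bit signed width the source is clamped to
       * [-128, 127] and then masked with 0xff; with an 8-bit unsigned width
       * it is simply clamped to a maximum of 255.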
       */
      static fs_reg
      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         assert(src.type == dst.type);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp to the maximum value. */
               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
                               brw_imm_d((int)scale(widths[c] - s)),
                               BRW_CONDITIONAL_L);

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                  brw_imm_d(-(int)scale(widths[c] - s) - 1),
                                  BRW_CONDITIONAL_GE);

               /* Mask off all but the bits we actually want. Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }

      /**
       * Convert a normalized fixed-point vector of the specified signedness
       * and bit widths into a floating point vector.
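       *
       * e.g. an 8-bit UNORM value of 255 becomes 255 * (1.0 / 255) = 1.0.
       * For SNORM formats the result is additionally clamped to -1.0 from
       * below, since the most negative two's complement value would
       * otherwise map slightly below -1.0.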
       */
      static fs_reg
      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
                               const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Convert to float. */
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Divide by the normalization constants. */
               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
                       brw_imm_f(1.0f / scale(widths[c] - s)));

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c),
                                  offset(dst, bld, c), brw_imm_f(-1.0f),
                                  BRW_CONDITIONAL_GE);
            }
         }
         return dst;
      }

      /**
       * Convert a floating-point vector into a normalized fixed-point vector
       * of the specified signedness and bit widths.
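       *
       * e.g. for an 8-bit SNORM channel the input is clamped to [-1.0, 1.0],
       * scaled by 127.0, rounded to the nearest integer and masked with 0xff.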
       */
      static fs_reg
      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
                             const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp the normalized floating-point argument. */
               if (is_signed) {
                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
                                  brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);

                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(1.0f), BRW_CONDITIONAL_L);
               } else {
                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
                                             offset(src, bld, c)));
               }

               /* Multiply by the normalization constants. */
               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
                       brw_imm_f((float)scale(widths[c] - s)));

               /* Convert to integer. */
               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));

               /* Mask off all but the bits we actually want. Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }

      /**
       * Convert a floating point vector of the specified bit widths into a
       * 32-bit floating point vector.
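       *
       * The small-float channels of formats like R11G11B10_FLOAT have a
       * 5-bit exponent and no sign bit, just like half-float, so shifting
       * them left by 15 - width bits lines them up with the half-float
       * layout and lets F16TO32 do the actual conversion.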
       */
      static fs_reg
      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
                * This works because they have a 5-bit exponent just like the
                * 16-bit floating point format, and they have no sign bit.
                */
               if (widths[c] < 16)
                  bld.SHL(offset(dst, bld, c),
                          offset(dst, bld, c), brw_imm_ud(15 - widths[c]));

               /* Convert to 32-bit floating point. */
               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
            }
         }

         return fdst;
      }

      /**
       * Convert a vector into a floating point vector of the specified bit
       * widths.
       */
      static fs_reg
      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
                            const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));

               /* Clamp to the minimum value. */
               if (widths[c] < 16)
                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(0.0f), BRW_CONDITIONAL_GE);

               /* Convert to 16-bit floating-point. */
               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));

               /* Discard the least significant bits to get floating point
                * numbers of the requested width. This works because the
                * 10-bit and 11-bit floating point formats have a 5-bit
                * exponent just like the 16-bit format, and they have no sign
                * bit.
                */
               if (widths[c] < 16)
                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_ud(15 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Fill missing components of a vector with 0, 0, 0, 1.
       */
      static fs_reg
      emit_pad(const fs_builder &bld, const fs_reg &src,
               const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);
         const unsigned pad[] = { 0, 0, 0, 1 };

         for (unsigned c = 0; c < 4; ++c)
            bld.MOV(offset(dst, bld, c),
                    widths[c] ? offset(src, bld, c)
                              : fs_reg(brw_imm_ud(pad[c])));

         return dst;
      }
   }
}

namespace brw {
   namespace image_access {
      /**
       * Load a vector from a surface of the given format and dimensionality
       * at the given coordinates. \p surf_dims and \p arr_dims give the
       * number of non-array and array coordinates of the image respectively.
       */
      fs_reg
      emit_image_load(const fs_builder &bld,
                      const fs_reg &image, const fs_reg &addr,
                      unsigned surf_dims, unsigned arr_dims,
                      unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const gen_device_info *devinfo = bld.shader->devinfo;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);
         fs_reg tmp;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
            /* Hopefully we get here most of the time... */
            tmp = emit_typed_read(bld, image, saddr, dims,
                                  isl_format_get_num_channels(lower_format));
         } else {
            /* Untyped surface reads return 32 bits of the surface per
             * component, without any sort of unpacking or type conversion,
             */
            const unsigned size = isl_format_get_layout(format)->bpb / 32;
            /* they don't properly handle out of bounds access, so we have to
             * check manually if the coordinates are valid and predicate the
             * surface read on the result,
             */
            const brw_predicate pred =
               emit_untyped_image_check(bld, image,
                                        emit_bounds_check(bld, image,
                                                          saddr, dims));

            /* and they don't know about surface coordinates, so we need to
             * convert them to a raw memory offset.
             */
            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);

            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);

            /* An out of bounds surface access should give zero as result. */
            for (unsigned c = 0; c < size; ++c)
               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
                                           offset(tmp, bld, c), brw_imm_d(0)));
         }

         /* Set the register type to D instead of UD if the data type is
          * represented as a signed integer in memory so that sign extension
          * is handled correctly by unpack.
          */
         if (needs_sign_extension(format))
            tmp = retype(tmp, BRW_REGISTER_TYPE_D);

         if (!has_supported_bit_layout(devinfo, format)) {
            /* Unpack individual vector components from the bitfield if the
             * hardware is unable to do it for us.
             */
            if (has_split_bit_layout(devinfo, format))
               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
                               get_bit_widths(lower_format));
            else
               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
                                 get_bit_widths(format));

         } else if ((needs_sign_extension(format) &&
                     !is_conversion_trivial(devinfo, format)) ||
                    has_undefined_high_bits(devinfo, format)) {
            /* Perform a trivial unpack even though the bit layout matches in
             * order to get the most significant bits of each component
             * initialized properly.
             */
            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
                              get_bit_widths(format));
         }

         if (!isl_format_has_int_channel(format)) {
            if (is_conversion_trivial(devinfo, format)) {
               /* Just need to cast the vector to the target type. */
               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
            } else {
               /* Do the right sort of type conversion to float. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_from_float(
                     bld, tmp, get_bit_widths(format));
               else
                  tmp = emit_convert_from_scaled(
                     bld, tmp, get_bit_widths(format),
                     isl_format_has_snorm_channel(format));
            }
         }

         /* Initialize missing components of the result. */
         return emit_pad(bld, tmp, get_bit_widths(format));
      }

      /**
       * Store a vector in a surface of the given format and dimensionality at
       * the given coordinates. \p surf_dims and \p arr_dims give the number
       * of non-array and array coordinates of the image respectively.
       */
      void
      emit_image_store(const fs_builder &bld, const fs_reg &image,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned surf_dims, unsigned arr_dims,
                       unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const gen_device_info *devinfo = bld.shader->devinfo;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (gl_format == GL_NONE) {
            /* We don't know what the format is, but that's fine because it
             * implies write-only access, and typed surface writes are always
             * able to take care of type conversion and packing for us.
             */
            emit_typed_write(bld, image, saddr, src, dims, 4);

         } else {
            const isl_format lower_format =
               isl_lower_storage_image_format(devinfo, format);
            fs_reg tmp = src;

            if (!is_conversion_trivial(devinfo, format)) {
               /* Do the right sort of type conversion. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));

               else if (isl_format_has_int_channel(format))
                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
                                                isl_format_has_sint_channel(format));

               else
                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
                                               isl_format_has_snorm_channel(format));
            }

            /* We're down to bit manipulation at this point. */
            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);

            if (!has_supported_bit_layout(devinfo, format)) {
               /* Pack the vector components into a bitfield if the hardware
                * is unable to do it for us.
                */
               if (has_split_bit_layout(devinfo, format))
                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
                                    get_bit_widths(lower_format));

               else
                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
                                  get_bit_widths(format));
            }

            if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
               /* Hopefully we get here most of the time... */
               emit_typed_write(bld, image, saddr, tmp, dims,
                                isl_format_get_num_channels(lower_format));

            } else {
               /* Untyped surface writes store 32 bits of the surface per
                * component, without any sort of packing or type conversion,
                */
               const unsigned size = isl_format_get_layout(format)->bpb / 32;

               /* they don't properly handle out of bounds access, so we have
                * to check manually if the coordinates are valid and predicate
                * the surface write on the result,
                */
               const brw_predicate pred =
                  emit_untyped_image_check(bld, image,
                                           emit_bounds_check(bld, image,
                                                             saddr, dims));

               /* and, phew, they don't know about surface coordinates, so we
                * need to convert them to a raw memory offset.
                */
               const fs_reg laddr = emit_address_calculation(
                  bld, image, saddr, dims);

               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
            }
         }
      }

      /**
       * Perform an atomic read-modify-write operation in a surface of the
       * given dimensionality at the given coordinates. \p surf_dims and \p
       * arr_dims give the number of non-array and array coordinates of the
       * image respectively. Main building block of the imageAtomic GLSL
       * built-ins.
       */
      fs_reg
      emit_image_atomic(const fs_builder &bld,
                        const fs_reg &image, const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned surf_dims, unsigned arr_dims,
                        unsigned rsize, unsigned op)
      {
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         /* Avoid performing an atomic operation on an unbound surface. */
         const brw_predicate pred = emit_typed_atomic_check(bld, image);

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
                                   ISL_FORMAT_R32_UINT);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims,
                                  ISL_FORMAT_R32_UINT);

         /* Thankfully we can do without untyped atomics here. */
         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
                                              dims, rsize, op, pred);

         /* An unbound surface access should give zero as result. */
         if (rsize && pred)
            set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));

         return retype(tmp, src0.type);
      }
   }
}