src/intel/compiler/brw_fs_surface_builder.cpp

   1 /*
   2  * Copyright © 2013-2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "isl/isl.h"
  25 #include "brw_fs_surface_builder.h"
  26 #include "brw_fs.h"
  27
  28 using namespace brw;
  29
  30 namespace brw {
  31    namespace surface_access {
  32       namespace {
  33          /**
  34           * Generate a logical send opcode for a surface message and return
  35           * the result.
  36           */
  37          fs_reg
  38          emit_send(const fs_builder &bld, enum opcode opcode,
  39                    const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
  40                    unsigned dims, unsigned arg, unsigned rsize,
  41                    brw_predicate pred = BRW_PREDICATE_NONE)
  42          {
  43             /* Reduce the dynamically uniform surface index to a single
  44              * scalar.
  45              */
  46             const fs_reg usurface = bld.emit_uniformize(surface);
  47             const fs_reg srcs[] = {
  48                addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
  49             };
  50             const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
  51             fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
  52
  53             inst->size_written = rsize * dst.component_size(inst->exec_size);
  54             inst->predicate = pred;
  55             return dst;
  56          }
  57       }
  58
  59       /**
  60        * Emit an untyped surface read opcode.  \p dims determines the number
  61        * of components of the address and \p size the number of components of
  62        * the returned value.
  63        */
  64       fs_reg
  65       emit_untyped_read(const fs_builder &bld,
  66                         const fs_reg &surface, const fs_reg &addr,
  67                         unsigned dims, unsigned size,
  68                         brw_predicate pred)
  69       {
  70          return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
  71                           addr, fs_reg(), surface, dims, size, size, pred);
  72       }
  73
  74       /**
  75        * Emit an untyped surface write opcode.  \p dims determines the number
  76        * of components of the address and \p size the number of components of
  77        * the argument.
  78        */
  79       void
  80       emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
  81                          const fs_reg &addr, const fs_reg &src,
  82                          unsigned dims, unsigned size,
  83                          brw_predicate pred)
  84       {
  85          emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
  86                    addr, src, surface, dims, size, 0, pred);
  87       }
  88
  89       /**
  90        * Emit an untyped surface atomic opcode.  \p dims determines the number
  91        * of components of the address and \p rsize the number of components of
  92        * the returned value (either zero or one).
  93        */
  94       fs_reg
  95       emit_untyped_atomic(const fs_builder &bld,
  96                           const fs_reg &surface, const fs_reg &addr,
  97                           const fs_reg &src0, const fs_reg &src1,
  98                           unsigned dims, unsigned rsize, unsigned op,
  99                           brw_predicate pred)
 100       {
 101          /* FINISHME: Factor out this frequently recurring pattern into a
 102           * helper function.
 103           */
 104          const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
 105          const fs_reg srcs[] = { src0, src1 };
 106          const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
 107          bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
 108
 109          return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
 110                           addr, tmp, surface, dims, op, rsize, pred);
 111       }
 112
 113       /**
 114        * Emit an untyped surface atomic float opcode.  \p dims determines the
 115        * number of components of the address and \p rsize the number of
 116        * components of the returned value (either zero or one).
 117        */
 118       fs_reg
 119       emit_untyped_atomic_float(const fs_builder &bld,
 120                                 const fs_reg &surface, const fs_reg &addr,
 121                                 const fs_reg &src0, const fs_reg &src1,
 122                                 unsigned dims, unsigned rsize, unsigned op,
 123                                 brw_predicate pred)
 124       {
 125          /* FINISHME: Factor out this frequently recurring pattern into a
 126           * helper function.
 127           */
 128          const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
 129          const fs_reg srcs[] = { src0, src1 };
 130          const fs_reg tmp = bld.vgrf(src0.type, n);
 131          bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
 132
 133          return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
 134                           addr, tmp, surface, dims, op, rsize, pred);
 135       }
 136
 137       /**
 138        * Emit a typed surface read opcode.  \p dims determines the number of
 139        * components of the address and \p size the number of components of the
 140        * returned value.
 141        */
 142       fs_reg
 143       emit_typed_read(const fs_builder &bld, const fs_reg &surface,
 144                       const fs_reg &addr, unsigned dims, unsigned size)
 145       {
 146          return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
 147                           addr, fs_reg(), surface, dims, size, size);
 148       }
 149
 150       /**
 151        * Emit a typed surface write opcode.  \p dims determines the number of
 152        * components of the address and \p size the number of components of the
 153        * argument.
 154        */
 155       void
 156       emit_typed_write(const fs_builder &bld, const fs_reg &surface,
 157                        const fs_reg &addr, const fs_reg &src,
 158                        unsigned dims, unsigned size)
 159       {
 160          emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
 161                    addr, src, surface, dims, size, 0);
 162       }
 163
 164       /**
 165        * Emit a typed surface atomic opcode.  \p dims determines the number of
 166        * components of the address and \p rsize the number of components of
 167        * the returned value (either zero or one).
 168        */
 169       fs_reg
 170       emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
 171                         const fs_reg &addr,
 172                         const fs_reg &src0, const fs_reg &src1,
 173                         unsigned dims, unsigned rsize, unsigned op,
 174                         brw_predicate pred)
 175       {
 176          /* FINISHME: Factor out this frequently recurring pattern into a
 177           * helper function.
 178           */
 179          const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
 180          const fs_reg srcs[] = { src0, src1 };
 181          const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
 182          bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
 183
 184          return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
 185                           addr, tmp, surface, dims, op, rsize);
 186       }
 187
 188       fs_reg
 189       emit_byte_scattered_read(const fs_builder &bld,
 190                                const fs_reg &surface, const fs_reg &addr,
 191                                unsigned dims, unsigned size,
 192                                unsigned bit_size, brw_predicate pred)
 193       {
 194          return emit_send(bld, SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
 195                           addr, fs_reg(), surface, dims, bit_size, size, pred);
 196       }
 197
 198       void
 199       emit_byte_scattered_write(const fs_builder &bld, const fs_reg &surface,
 200                                 const fs_reg &addr, const fs_reg &src,
 201                                 unsigned dims,
 202                                 unsigned bit_size, brw_predicate pred)
 203       {
 204          emit_send(bld, SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
 205                    addr, src, surface, dims, bit_size, 0, pred);
 206       }
 207    }
 208 }
 209
 210 namespace {
 211    namespace image_format_info {
 212       /* The higher compiler layers use the GL enums for image formats even if
 213        * they come in from SPIR-V or Vulkan.  We need to turn them into an ISL
 214        * enum before we can use them.
 215        */
 216       static enum isl_format
 217       isl_format_for_gl_format(uint32_t gl_format)
 218       {
 219          switch (gl_format) {
 220          case GL_R8:             return ISL_FORMAT_R8_UNORM;
 221          case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
 222          case GL_R8UI:           return ISL_FORMAT_R8_UINT;
 223          case GL_R8I:            return ISL_FORMAT_R8_SINT;
 224          case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
 225          case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
 226          case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
 227          case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
 228          case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
 229          case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
 230          case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
 231          case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
 232          case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
 233          case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
 234          case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
 235          case GL_R16:            return ISL_FORMAT_R16_UNORM;
 236          case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
 237          case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
 238          case GL_R16UI:          return ISL_FORMAT_R16_UINT;
 239          case GL_R16I:           return ISL_FORMAT_R16_SINT;
 240          case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
 241          case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
 242          case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
 243          case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
 244          case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
 245          case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
 246          case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
 247          case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
 248          case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
 249          case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
 250          case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
 251          case GL_R32UI:          return ISL_FORMAT_R32_UINT;
 252          case GL_R32I:           return ISL_FORMAT_R32_SINT;
 253          case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
 254          case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
 255          case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
 256          case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
 257          case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
 258          case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
 259          case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
 260          default:
 261             assert(!"Invalid image format");
 262             return ISL_FORMAT_UNSUPPORTED;
 263          }
 264       }
 265
 266       /**
 267        * Simple 4-tuple of scalars used to pass around per-color component
 268        * values.
 269        */
 270       struct color_u {
 271          color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
 272          {
 273          }
 274
 275          color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
 276             r(r), g(g), b(b), a(a)
 277          {
 278          }
 279
 280          unsigned
 281          operator[](unsigned i) const
 282          {
 283             const unsigned xs[] = { r, g, b, a };
 284             return xs[i];
 285          }
 286
 287          unsigned r, g, b, a;
 288       };
 289
 290       /**
 291        * Return the per-channel bitfield widths for a given image format.
 292        */
 293       inline color_u
 294       get_bit_widths(isl_format format)
 295       {
 296          const isl_format_layout *fmtl = isl_format_get_layout(format);
 297
 298          return color_u(fmtl->channels.r.bits,
 299                         fmtl->channels.g.bits,
 300                         fmtl->channels.b.bits,
 301                         fmtl->channels.a.bits);
 302       }
 303
 304       /**
 305        * Return the per-channel bitfield shifts for a given image format.
 306        */
 307       inline color_u
 308       get_bit_shifts(isl_format format)
 309       {
 310          const color_u widths = get_bit_widths(format);
 311          return color_u(0, widths.r, widths.r + widths.g,
 312                         widths.r + widths.g + widths.b);
 313       }
 314
 315       /**
 316        * Return true if all present components have the same bit width.
 317        */
 318       inline bool
 319       is_homogeneous(isl_format format)
 320       {
 321          const color_u widths = get_bit_widths(format);
 322          return ((widths.g == 0 || widths.g == widths.r) &&
 323                  (widths.b == 0 || widths.b == widths.r) &&
 324                  (widths.a == 0 || widths.a == widths.r));
 325       }
 326
 327       /**
 328        * Return true if the format conversion boils down to a trivial copy.
 329        */
 330       inline bool
 331       is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
 332       {
 333          return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
 334                  format == isl_lower_storage_image_format(devinfo, format);
 335       }
 336
 337       /**
 338        * Return true if the hardware natively supports some format with
 339        * compatible bitfield layout, but possibly different data types.
 340        */
 341       inline bool
 342       has_supported_bit_layout(const gen_device_info *devinfo,
 343                                isl_format format)
 344       {
 345          const color_u widths = get_bit_widths(format);
 346          const color_u lower_widths = get_bit_widths(
 347             isl_lower_storage_image_format(devinfo, format));
 348
 349          return (widths.r == lower_widths.r &&
 350                  widths.g == lower_widths.g &&
 351                  widths.b == lower_widths.b &&
 352                  widths.a == lower_widths.a);
 353       }
 354
 355       /**
 356        * Return true if we are required to spread individual components over
 357        * several components of the format used by the hardware (RG32 and
 358        * friends implemented as RGBA16UI).
 359        */
 360       inline bool
 361       has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
 362       {
 363          const isl_format lower_format =
 364             isl_lower_storage_image_format(devinfo, format);
 365
 366          return (isl_format_get_num_channels(format) <
 367                  isl_format_get_num_channels(lower_format));
 368       }
 369
 370       /**
 371        * Return true if the hardware returns garbage in the unused high bits
 372        * of each component.  This may happen on IVB because we rely on the
 373        * undocumented behavior that typed reads from surfaces of the
 374        * unsupported R8 and R16 formats return useful data in their least
 375        * significant bits.
 376        */
 377       inline bool
 378       has_undefined_high_bits(const gen_device_info *devinfo,
 379                               isl_format format)
 380       {
 381          const isl_format lower_format =
 382             isl_lower_storage_image_format(devinfo, format);
 383
 384          return (devinfo->gen == 7 && !devinfo->is_haswell &&
 385                  (lower_format == ISL_FORMAT_R16_UINT ||
 386                   lower_format == ISL_FORMAT_R8_UINT));
 387       }
 388
 389       /**
 390        * Return true if the format represents values as signed integers
 391        * requiring sign extension when unpacking.
 392        */
 393       inline bool
 394       needs_sign_extension(isl_format format)
 395       {
 396          return isl_format_has_snorm_channel(format) ||
 397                 isl_format_has_sint_channel(format);
 398       }
 399    }
 400
 401    namespace image_validity {
 402       /**
 403        * Check whether the bound image is suitable for untyped access.
 404        */
 405       static brw_predicate
 406       emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
 407                                brw_predicate pred)
 408       {
 409          const gen_device_info *devinfo = bld.shader->devinfo;
 410          const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
 411
 412          if (devinfo->gen == 7 && !devinfo->is_haswell) {
 413             /* Check whether the first stride component (i.e. the Bpp value)
 414              * is greater than four, what on Gen7 indicates that a surface of
 415              * type RAW has been bound for untyped access.  Reading or writing
 416              * to a surface of type other than RAW using untyped surface
 417              * messages causes a hang on IVB and VLV.
 418              */
 419             set_predicate(pred,
 420                           bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
 421                                   BRW_CONDITIONAL_G));
 422
 423             return BRW_PREDICATE_NORMAL;
 424          } else {
 425             /* More recent generations handle the format mismatch
 426              * gracefully.
 427              */
 428             return pred;
 429          }
 430       }
 431
 432       /**
 433        * Check whether there is an image bound at the given index and write
 434        * the comparison result to f0.0.  Returns an appropriate predication
 435        * mode to use on subsequent image operations.
 436        */
 437       static brw_predicate
 438       emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
 439       {
 440          const gen_device_info *devinfo = bld.shader->devinfo;
 441          const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
 442
 443          if (devinfo->gen == 7 && !devinfo->is_haswell) {
 444             /* Check the first component of the size field to find out if the
 445              * image is bound.  Necessary on IVB for typed atomics because
 446              * they don't seem to respect null surfaces and will happily
 447              * corrupt or read random memory when no image is bound.
 448              */
 449             bld.CMP(bld.null_reg_ud(),
 450                     retype(size, BRW_REGISTER_TYPE_UD),
 451                     brw_imm_d(0), BRW_CONDITIONAL_NZ);
 452
 453             return BRW_PREDICATE_NORMAL;
 454          } else {
 455             /* More recent platforms implement compliant behavior when a null
 456              * surface is bound.
 457              */
 458             return BRW_PREDICATE_NONE;
 459          }
 460       }
 461
 462       /**
 463        * Check whether the provided coordinates are within the image bounds
 464        * and write the comparison result to f0.0.  Returns an appropriate
 465        * predication mode to use on subsequent image operations.
 466        */
 467       static brw_predicate
 468       emit_bounds_check(const fs_builder &bld, const fs_reg &image,
 469                         const fs_reg &addr, unsigned dims)
 470       {
 471          const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
 472
 473          for (unsigned c = 0; c < dims; ++c)
 474             set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
 475                           bld.CMP(bld.null_reg_ud(),
 476                                   offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
 477                                   offset(size, bld, c),
 478                                   BRW_CONDITIONAL_L));
 479
 480          return BRW_PREDICATE_NORMAL;
 481       }
 482    }
 483
 484    namespace image_coordinates {
 485       /**
 486        * Return the total number of coordinates needed to address a texel of
 487        * the surface, which may be more than the sum of \p surf_dims and \p
 488        * arr_dims if padding is required.
 489        */
 490       static unsigned
 491       num_image_coordinates(const fs_builder &bld,
 492                             unsigned surf_dims, unsigned arr_dims,
 493                             isl_format format)
 494       {
 495          /* HSW in vec4 mode and our software coordinate handling for untyped
 496           * reads want the array index to be at the Z component.
 497           */
 498          const bool array_index_at_z =
 499             format != ISL_FORMAT_UNSUPPORTED &&
 500             !isl_has_matching_typed_storage_image_format(
 501                bld.shader->devinfo, format);
 502          const unsigned zero_dims =
 503             ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
 504
 505          return surf_dims + zero_dims + arr_dims;
 506       }
 507
 508       /**
 509        * Transform image coordinates into the form expected by the
 510        * implementation.
 511        */
 512       static fs_reg
 513       emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
 514                              unsigned surf_dims, unsigned arr_dims,
 515                              isl_format format)
 516       {
 517          const unsigned dims =
 518             num_image_coordinates(bld, surf_dims, arr_dims, format);
 519
 520          if (dims > surf_dims + arr_dims) {
 521             assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
 522             /* The array index is required to be passed in as the Z component,
 523              * insert a zero at the Y component to shift it to the right
 524              * position.
 525              *
 526              * FINISHME: Factor out this frequently recurring pattern into a
 527              * helper function.
 528              */
 529             const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
 530             const fs_reg dst = bld.vgrf(addr.type, dims);
 531             bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
 532             return dst;
 533          } else {
 534             return addr;
 535          }
 536       }
 537
 538       /**
 539        * Calculate the offset in memory of the texel given by \p coord.
 540        *
 541        * This is meant to be used with untyped surface messages to access a
 542        * tiled surface, what involves taking into account the tiling and
 543        * swizzling modes of the surface manually so it will hopefully not
 544        * happen very often.
 545        *
 546        * The tiling algorithm implemented here matches either the X or Y
 547        * tiling layouts supported by the hardware depending on the tiling
 548        * coefficients passed to the program as uniforms.  See Volume 1 Part 2
 549        * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
 550        * explanation of the hardware tiling format.
 551        */
 552       static fs_reg
 553       emit_address_calculation(const fs_builder &bld, const fs_reg &image,
 554                                const fs_reg &coord, unsigned dims)
 555       {
 556          const gen_device_info *devinfo = bld.shader->devinfo;
 557          const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
 558          const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
 559          const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
 560          const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
 561          const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
 562          const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
 563          const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
 564          const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
 565          const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
 566
 567          /* Shift the coordinates by the fixed surface offset.  It may be
 568           * non-zero if the image is a single slice of a higher-dimensional
 569           * surface, or if a non-zero mipmap level of the surface is bound to
 570           * the pipeline.  The offset needs to be applied here rather than at
 571           * surface state set-up time because the desired slice-level may
 572           * start mid-tile, so simply shifting the surface base address
 573           * wouldn't give a well-formed tiled surface in the general case.
 574           */
 575          for (unsigned c = 0; c < 2; ++c)
 576             bld.ADD(offset(addr, bld, c), offset(off, bld, c),
 577                     (c < dims ?
 578                      offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
 579                      fs_reg(brw_imm_d(0))));
 580
 581          /* The layout of 3-D textures in memory is sort-of like a tiling
 582           * format.  At each miplevel, the slices are arranged in rows of
 583           * 2^level slices per row.  The slice row is stored in tmp.y and
 584           * the slice within the row is stored in tmp.x.
 585           *
 586           * The layout of 2-D array textures and cubemaps is much simpler:
 587           * Depending on whether the ARYSPC_LOD0 layout is in use it will be
 588           * stored in memory as an array of slices, each one being a 2-D
 589           * arrangement of miplevels, or as a 2D arrangement of miplevels,
 590           * each one being an array of slices.  In either case the separation
 591           * between slices of the same LOD is equal to the qpitch value
 592           * provided as stride.w.
 593           *
 594           * This code can be made to handle either 2D arrays and 3D textures
 595           * by passing in the miplevel as tile.z for 3-D textures and 0 in
 596           * tile.z for 2-D array textures.
 597           *
 598           * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
 599           * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
 600           * of the hardware 3D texture and 2D array layouts.
 601           */
 602          if (dims > 2) {
 603             /* Decompose z into a major (tmp.y) and a minor (tmp.x)
 604              * index.
 605              */
 606             bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
 607                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
 608             bld.SHR(offset(tmp, bld, 1),
 609                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
 610                     offset(tile, bld, 2));
 611
 612             /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
 613              * slice offset.
 614              */
 615             for (unsigned c = 0; c < 2; ++c) {
 616                bld.MUL(offset(tmp, bld, c),
 617                        offset(stride, bld, 2 + c), offset(tmp, bld, c));
 618                bld.ADD(offset(addr, bld, c),
 619                        offset(addr, bld, c), offset(tmp, bld, c));
 620             }
 621          }
 622
 623          if (dims > 1) {
 624             /* Calculate the major/minor x and y indices.  In order to
 625              * accommodate both X and Y tiling, the Y-major tiling format is
 626              * treated as being a bunch of narrow X-tiles placed next to each
 627              * other.  This means that the tile width for Y-tiling is actually
 628              * the width of one sub-column of the Y-major tile where each 4K
 629              * tile has 8 512B sub-columns.
 630              *
 631              * The major Y value is the row of tiles in which the pixel lives.
 632              * The major X value is the tile sub-column in which the pixel
 633              * lives; for X tiling, this is the same as the tile column, for Y
 634              * tiling, each tile has 8 sub-columns.  The minor X and Y indices
 635              * are the position within the sub-column.
 636              */
 637             for (unsigned c = 0; c < 2; ++c) {
 638                /* Calculate the minor x and y indices. */
 639                bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
 640                        brw_imm_d(0), offset(addr, bld, c));
 641
 642                /* Calculate the major x and y indices. */
 643                bld.SHR(offset(major, bld, c),
 644                        offset(addr, bld, c), offset(tile, bld, c));
 645             }
 646
 647             /* Calculate the texel index from the start of the tile row and
 648              * the vertical coordinate of the row.
 649              * Equivalent to:
 650              *   tmp.x = (major.x << tile.y << tile.x) +
 651              *           (minor.y << tile.x) + minor.x
 652              *   tmp.y = major.y << tile.y
 653              */
 654             bld.SHL(tmp, major, offset(tile, bld, 1));
 655             bld.ADD(tmp, tmp, offset(minor, bld, 1));
 656             bld.SHL(tmp, tmp, offset(tile, bld, 0));
 657             bld.ADD(tmp, tmp, minor);
 658             bld.SHL(offset(tmp, bld, 1),
 659                     offset(major, bld, 1), offset(tile, bld, 1));
 660
 661             /* Add it to the start of the tile row. */
 662             bld.MUL(offset(tmp, bld, 1),
 663                     offset(tmp, bld, 1), offset(stride, bld, 1));
 664             bld.ADD(tmp, tmp, offset(tmp, bld, 1));
 665
 666             /* Multiply by the Bpp value. */
 667             bld.MUL(dst, tmp, stride);
 668
 669             if (devinfo->gen < 8 && !devinfo->is_baytrail) {
               /* Take into account the two dynamically specified shifts.
                * Both are used to implement swizzling of X-tiled
                * surfaces.  For Y-tiled surfaces only one bit needs to be
                * XOR-ed with bit 6 of the memory address, so a swz value of
                * 0xff (actually interpreted as 31 by the hardware) will be
                * provided to cause the relevant bit of tmp.y to be zero and
                * turn the first XOR into the identity.  For linear surfaces
                * or platforms lacking address swizzling both shifts will be
                * 0xff causing the relevant bits of both tmp.x and .y to be
                * zero, which effectively disables swizzling.
                */
 681                for (unsigned c = 0; c < 2; ++c)
 682                   bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));
 683
 684                /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
 685                bld.XOR(tmp, tmp, offset(tmp, bld, 1));
 686                bld.AND(tmp, tmp, brw_imm_d(1 << 6));
 687                bld.XOR(dst, dst, tmp);
 688             }
 689
 690          } else {
 691             /* Multiply by the Bpp/stride value.  Note that the addr.y may be
 692              * non-zero even if the image is one-dimensional because a
 693              * vertical offset may have been applied above to select a
 694              * non-zero slice or level of a higher-dimensional texture.
 695              */
 696             bld.MUL(offset(addr, bld, 1),
 697                     offset(addr, bld, 1), offset(stride, bld, 1));
 698             bld.ADD(addr, addr, offset(addr, bld, 1));
 699             bld.MUL(dst, addr, stride);
 700          }
 701
 702          return dst;
 703       }
 704    }
 705
 706    namespace image_format_conversion {
 707       using image_format_info::color_u;
 708
 709       namespace {
 710          /**
 711           * Maximum representable value in an unsigned integer with the given
 712           * number of bits.
 713           */
 714          inline unsigned
 715          scale(unsigned n)
 716          {
 717             return (1 << n) - 1;
 718          }
 719       }
 720
 721       /**
 722        * Pack the vector \p src in a bitfield given the per-component bit
 723        * shifts and widths.  Note that bitfield components are not allowed to
 724        * cross 32-bit boundaries.
 725        */
 726       static fs_reg
 727       emit_pack(const fs_builder &bld, const fs_reg &src,
 728                 const color_u &shifts, const color_u &widths)
 729       {
 730          const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
 731          bool seen[4] = {};
 732
 733          for (unsigned c = 0; c < 4; ++c) {
 734             if (widths[c]) {
 735                const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
 736
 737                /* Shift each component left to the correct bitfield position. */
 738                bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));
 739
 740                /* Add everything up. */
 741                if (seen[shifts[c] / 32]) {
 742                   bld.OR(offset(dst, bld, shifts[c] / 32),
 743                          offset(dst, bld, shifts[c] / 32), tmp);
 744                } else {
 745                   bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
 746                   seen[shifts[c] / 32] = true;
 747                }
 748             }
 749          }
 750
 751          return dst;
 752       }
 753
 754       /**
 755        * Unpack a vector from the bitfield \p src given the per-component bit
 756        * shifts and widths.  Note that bitfield components are not allowed to
 757        * cross 32-bit boundaries.
 758        */
 759       static fs_reg
 760       emit_unpack(const fs_builder &bld, const fs_reg &src,
 761                   const color_u &shifts, const color_u &widths)
 762       {
 763          const fs_reg dst = bld.vgrf(src.type, 4);
 764
 765          for (unsigned c = 0; c < 4; ++c) {
 766             if (widths[c]) {
 767                /* Shift left to discard the most significant bits. */
 768                bld.SHL(offset(dst, bld, c),
 769                        offset(src, bld, shifts[c] / 32),
 770                        brw_imm_ud(32 - shifts[c] % 32 - widths[c]));
 771
 772                /* Shift back to the least significant bits using an arithmetic
 773                 * shift to get sign extension on signed types.
 774                 */
 775                bld.ASR(offset(dst, bld, c),
 776                        offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
 777             }
 778          }
 779
 780          return dst;
 781       }
 782
 783       /**
 784        * Convert an integer vector into another integer vector of the
 785        * specified bit widths, properly handling overflow.
 786        */
 787       static fs_reg
 788       emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
 789                               const color_u &widths, bool is_signed)
 790       {
 791          const unsigned s = (is_signed ? 1 : 0);
 792          const fs_reg dst = bld.vgrf(
 793             is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
 794          assert(src.type == dst.type);
 795
 796          for (unsigned c = 0; c < 4; ++c) {
 797             if (widths[c]) {
 798                /* Clamp to the maximum value. */
 799                bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
 800                                brw_imm_d((int)scale(widths[c] - s)),
 801                                BRW_CONDITIONAL_L);
 802
 803                /* Clamp to the minimum value. */
 804                if (is_signed)
 805                   bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
 806                                   brw_imm_d(-(int)scale(widths[c] - s) - 1),
 807                                   BRW_CONDITIONAL_GE);
 808
 809                /* Mask off all but the bits we actually want.  Otherwise, if
 810                 * we pass a negative number into the hardware when it's
 811                 * expecting something like UINT8, it will happily clamp it to
 812                 * +255 for us.
 813                 */
 814                if (is_signed && widths[c] < 32)
 815                   bld.AND(offset(dst, bld, c), offset(dst, bld, c),
 816                           brw_imm_d(scale(widths[c])));
 817             }
 818          }
 819
 820          return dst;
 821       }
 822
 823       /**
 824        * Convert a normalized fixed-point vector of the specified signedness
 825        * and bit widths into a floating point vector.
 826        */
 827       static fs_reg
 828       emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
 829                                const color_u &widths, bool is_signed)
 830       {
 831          const unsigned s = (is_signed ? 1 : 0);
 832          const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
 833
 834          for (unsigned c = 0; c < 4; ++c) {
 835             if (widths[c]) {
 836                /* Convert to float. */
 837                bld.MOV(offset(dst, bld, c), offset(src, bld, c));
 838
 839                /* Divide by the normalization constants. */
 840                bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
 841                        brw_imm_f(1.0f / scale(widths[c] - s)));
 842
 843                /* Clamp to the minimum value. */
 844                if (is_signed)
 845                   bld.emit_minmax(offset(dst, bld, c),
 846                                   offset(dst, bld, c), brw_imm_f(-1.0f),
 847                                   BRW_CONDITIONAL_GE);
 848             }
 849          }
 850          return dst;
 851       }
 852
 853       /**
 854        * Convert a floating-point vector into a normalized fixed-point vector
 855        * of the specified signedness and bit widths.
 856        */
 857       static fs_reg
 858       emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
 859                              const color_u &widths, bool is_signed)
 860       {
 861          const unsigned s = (is_signed ? 1 : 0);
 862          const fs_reg dst = bld.vgrf(
 863             is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
 864          const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
 865
 866          for (unsigned c = 0; c < 4; ++c) {
 867             if (widths[c]) {
 868                /* Clamp the normalized floating-point argument. */
 869                if (is_signed) {
 870                   bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
 871                                   brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);
 872
 873                   bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
 874                                   brw_imm_f(1.0f), BRW_CONDITIONAL_L);
 875                } else {
 876                   set_saturate(true, bld.MOV(offset(fdst, bld, c),
 877                                              offset(src, bld, c)));
 878                }
 879
 880                /* Multiply by the normalization constants. */
 881                bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
 882                        brw_imm_f((float)scale(widths[c] - s)));
 883
 884                /* Convert to integer. */
 885                bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
 886                bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));
 887
 888                /* Mask off all but the bits we actually want.  Otherwise, if
 889                 * we pass a negative number into the hardware when it's
 890                 * expecting something like UINT8, it will happily clamp it to
 891                 * +255 for us.
 892                 */
 893                if (is_signed && widths[c] < 32)
 894                   bld.AND(offset(dst, bld, c), offset(dst, bld, c),
 895                           brw_imm_d(scale(widths[c])));
 896             }
 897          }
 898
 899          return dst;
 900       }
 901
 902       /**
 903        * Convert a floating point vector of the specified bit widths into a
 904        * 32-bit floating point vector.
 905        */
 906       static fs_reg
 907       emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
 908                               const color_u &widths)
 909       {
 910          const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
 911          const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
 912
 913          for (unsigned c = 0; c < 4; ++c) {
 914             if (widths[c]) {
 915                bld.MOV(offset(dst, bld, c), offset(src, bld, c));
 916
 917                /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
 918                 * This works because they have a 5-bit exponent just like the
 919                 * 16-bit floating point format, and they have no sign bit.
 920                 */
 921                if (widths[c] < 16)
 922                   bld.SHL(offset(dst, bld, c),
 923                           offset(dst, bld, c), brw_imm_ud(15 - widths[c]));
 924
 925                /* Convert to 32-bit floating point. */
 926                bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
 927             }
 928          }
 929
 930          return fdst;
 931       }
 932
 933       /**
 934        * Convert a vector into a floating point vector of the specified bit
 935        * widths.
 936        */
 937       static fs_reg
 938       emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
 939                             const color_u &widths)
 940       {
 941          const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
 942          const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
 943
 944          for (unsigned c = 0; c < 4; ++c) {
 945             if (widths[c]) {
 946                bld.MOV(offset(fdst, bld, c), offset(src, bld, c));
 947
 948                /* Clamp to the minimum value. */
 949                if (widths[c] < 16)
 950                   bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
 951                                   brw_imm_f(0.0f), BRW_CONDITIONAL_GE);
 952
 953                /* Convert to 16-bit floating-point. */
 954                bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));
 955
 956                /* Discard the least significant bits to get floating point
 957                 * numbers of the requested width.  This works because the
 958                 * 10-bit and 11-bit floating point formats have a 5-bit
 959                 * exponent just like the 16-bit format, and they have no sign
 960                 * bit.
 961                 */
 962                if (widths[c] < 16)
 963                   bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
 964                           brw_imm_ud(15 - widths[c]));
 965             }
 966          }
 967
 968          return dst;
 969       }
 970
 971       /**
 972        * Fill missing components of a vector with 0, 0, 0, 1.
 973        */
 974       static fs_reg
 975       emit_pad(const fs_builder &bld, const fs_reg &src,
 976                const color_u &widths)
 977       {
 978          const fs_reg dst = bld.vgrf(src.type, 4);
 979          const unsigned pad[] = { 0, 0, 0, 1 };
 980
 981          for (unsigned c = 0; c < 4; ++c)
 982             bld.MOV(offset(dst, bld, c),
 983                     widths[c] ? offset(src, bld, c)
 984                               : fs_reg(brw_imm_ud(pad[c])));
 985
 986          return dst;
 987       }
 988    }
 989 }
 990
 991 namespace brw {
 992    namespace image_access {
 993       /**
 994        * Load a vector from a surface of the given format and dimensionality
 995        * at the given coordinates.  \p surf_dims and \p arr_dims give the
 996        * number of non-array and array coordinates of the image respectively.
 997        */
 998       fs_reg
 999       emit_image_load(const fs_builder &bld,
1000                       const fs_reg &image, const fs_reg &addr,
1001                       unsigned surf_dims, unsigned arr_dims,
1002                       unsigned gl_format)
1003       {
1004          using namespace image_format_info;
1005          using namespace image_format_conversion;
1006          using namespace image_validity;
1007          using namespace image_coordinates;
1008          using namespace surface_access;
1009          const gen_device_info *devinfo = bld.shader->devinfo;
1010          const isl_format format = isl_format_for_gl_format(gl_format);
1011          const isl_format lower_format =
1012             isl_lower_storage_image_format(devinfo, format);
1013          fs_reg tmp;
1014
1015          /* Transform the image coordinates into actual surface coordinates. */
1016          const fs_reg saddr =
1017             emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
1018          const unsigned dims =
1019             num_image_coordinates(bld, surf_dims, arr_dims, format);
1020
1021          if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
1022             /* Hopefully we get here most of the time... */
1023             tmp = emit_typed_read(bld, image, saddr, dims,
1024                                   isl_format_get_num_channels(lower_format));
1025          } else {
1026             /* Untyped surface reads return 32 bits of the surface per
1027              * component, without any sort of unpacking or type conversion,
1028              */
1029             const unsigned size = isl_format_get_layout(format)->bpb / 32;
1030             /* they don't properly handle out of bounds access, so we have to
1031              * check manually if the coordinates are valid and predicate the
1032              * surface read on the result,
1033              */
1034             const brw_predicate pred =
1035                emit_untyped_image_check(bld, image,
1036                                         emit_bounds_check(bld, image,
1037                                                           saddr, dims));
1038
1039             /* and they don't know about surface coordinates, we need to
1040              * convert them to a raw memory offset.
1041              */
1042             const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);
1043
1044             tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);
1045
1046             /* An out of bounds surface access should give zero as result. */
1047             for (unsigned c = 0; c < size; ++c)
1048                set_predicate(pred, bld.SEL(offset(tmp, bld, c),
1049                                            offset(tmp, bld, c), brw_imm_d(0)));
1050          }
1051
1052          /* Set the register type to D instead of UD if the data type is
1053           * represented as a signed integer in memory so that sign extension
1054           * is handled correctly by unpack.
1055           */
1056          if (needs_sign_extension(format))
1057             tmp = retype(tmp, BRW_REGISTER_TYPE_D);
1058
1059          if (!has_supported_bit_layout(devinfo, format)) {
1060             /* Unpack individual vector components from the bitfield if the
1061              * hardware is unable to do it for us.
1062              */
1063             if (has_split_bit_layout(devinfo, format))
1064                tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
1065                                get_bit_widths(lower_format));
1066             else
1067                tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
1068                                  get_bit_widths(format));
1069
1070          } else if ((needs_sign_extension(format) &&
1071                      !is_conversion_trivial(devinfo, format)) ||
1072                     has_undefined_high_bits(devinfo, format)) {
1073             /* Perform a trivial unpack even though the bit layout matches in
1074              * order to get the most significant bits of each component
1075              * initialized properly.
1076              */
1077             tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
1078                               get_bit_widths(format));
1079          }
1080
1081          if (!isl_format_has_int_channel(format)) {
1082             if (is_conversion_trivial(devinfo, format)) {
1083                /* Just need to cast the vector to the target type. */
1084                tmp = retype(tmp, BRW_REGISTER_TYPE_F);
1085             } else {
1086                /* Do the right sort of type conversion to float. */
1087                if (isl_format_has_float_channel(format))
1088                   tmp = emit_convert_from_float(
1089                      bld, tmp, get_bit_widths(format));
1090                else
1091                   tmp = emit_convert_from_scaled(
1092                      bld, tmp, get_bit_widths(format),
1093                      isl_format_has_snorm_channel(format));
1094             }
1095          }
1096
1097          /* Initialize missing components of the result. */
1098          return emit_pad(bld, tmp, get_bit_widths(format));
1099       }
1100
1101       /**
1102        * Store a vector in a surface of the given format and dimensionality at
1103        * the given coordinates.  \p surf_dims and \p arr_dims give the number
1104        * of non-array and array coordinates of the image respectively.
1105        */
1106       void
1107       emit_image_store(const fs_builder &bld, const fs_reg &image,
1108                        const fs_reg &addr, const fs_reg &src,
1109                        unsigned surf_dims, unsigned arr_dims,
1110                        unsigned gl_format)
1111       {
1112          using namespace image_format_info;
1113          using namespace image_format_conversion;
1114          using namespace image_validity;
1115          using namespace image_coordinates;
1116          using namespace surface_access;
1117          const isl_format format = isl_format_for_gl_format(gl_format);
1118          const gen_device_info *devinfo = bld.shader->devinfo;
1119
1120          /* Transform the image coordinates into actual surface coordinates. */
1121          const fs_reg saddr =
1122             emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
1123          const unsigned dims =
1124             num_image_coordinates(bld, surf_dims, arr_dims, format);
1125
1126          if (gl_format == GL_NONE) {
1127             /* We don't know what the format is, but that's fine because it
1128              * implies write-only access, and typed surface writes are always
1129              * able to take care of type conversion and packing for us.
1130              */
1131             emit_typed_write(bld, image, saddr, src, dims, 4);
1132
1133          } else {
1134             const isl_format lower_format =
1135                isl_lower_storage_image_format(devinfo, format);
1136             fs_reg tmp = src;
1137
1138             if (!is_conversion_trivial(devinfo, format)) {
1139                /* Do the right sort of type conversion. */
1140                if (isl_format_has_float_channel(format))
1141                   tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));
1142
1143                else if (isl_format_has_int_channel(format))
1144                   tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
1145                                                 isl_format_has_sint_channel(format));
1146
1147                else
1148                   tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
1149                                                isl_format_has_snorm_channel(format));
1150             }
1151
1152             /* We're down to bit manipulation at this point. */
1153             tmp = retype(tmp, BRW_REGISTER_TYPE_UD);
1154
1155             if (!has_supported_bit_layout(devinfo, format)) {
1156                /* Pack the vector components into a bitfield if the hardware
1157                 * is unable to do it for us.
1158                 */
1159                if (has_split_bit_layout(devinfo, format))
1160                   tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
1161                                     get_bit_widths(lower_format));
1162
1163                else
1164                   tmp = emit_pack(bld, tmp, get_bit_shifts(format),
1165                                   get_bit_widths(format));
1166             }
1167
1168             if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
1169                /* Hopefully we get here most of the time... */
1170                emit_typed_write(bld, image, saddr, tmp, dims,
1171                                 isl_format_get_num_channels(lower_format));
1172
1173             } else {
1174                /* Untyped surface writes store 32 bits of the surface per
1175                 * component, without any sort of packing or type conversion,
1176                 */
1177                const unsigned size = isl_format_get_layout(format)->bpb / 32;
1178
1179                /* they don't properly handle out of bounds access, so we have
1180                 * to check manually if the coordinates are valid and predicate
1181                 * the surface write on the result,
1182                 */
1183                const brw_predicate pred =
1184                   emit_untyped_image_check(bld, image,
1185                                            emit_bounds_check(bld, image,
1186                                                              saddr, dims));
1187
1188                /* and, phew, they don't know about surface coordinates, we
1189                 * need to convert them to a raw memory offset.
1190                 */
1191                const fs_reg laddr = emit_address_calculation(
1192                   bld, image, saddr, dims);
1193
1194                emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
1195             }
1196          }
1197       }
1198
1199       /**
1200        * Perform an atomic read-modify-write operation in a surface of the
1201        * given dimensionality at the given coordinates.  \p surf_dims and \p
1202        * arr_dims give the number of non-array and array coordinates of the
1203        * image respectively.  Main building block of the imageAtomic GLSL
1204        * built-ins.
1205        */
1206       fs_reg
1207       emit_image_atomic(const fs_builder &bld,
1208                         const fs_reg &image, const fs_reg &addr,
1209                         const fs_reg &src0, const fs_reg &src1,
1210                         unsigned surf_dims, unsigned arr_dims,
1211                         unsigned rsize, unsigned op)
1212       {
1213          using namespace image_validity;
1214          using namespace image_coordinates;
1215          using namespace surface_access;
1216          /* Avoid performing an atomic operation on an unbound surface. */
1217          const brw_predicate pred = emit_typed_atomic_check(bld, image);
1218
1219          /* Transform the image coordinates into actual surface coordinates. */
1220          const fs_reg saddr =
1221             emit_image_coordinates(bld, addr, surf_dims, arr_dims,
1222                                    ISL_FORMAT_R32_UINT);
1223          const unsigned dims =
1224             num_image_coordinates(bld, surf_dims, arr_dims,
1225                                   ISL_FORMAT_R32_UINT);
1226
1227          /* Thankfully we can do without untyped atomics here. */
1228          const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
1229                                               dims, rsize, op, pred);
1230
1231          /* An unbound surface access should give zero as result. */
1232          if (rsize && pred)
1233             set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));
1234
1235          return retype(tmp, src0.type);
1236       }
1237    }
1238 }