/*
 * Copyright © 2013-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs_surface_builder.h"
#include "brw_fs.h"

using namespace brw;

namespace brw {
   namespace surface_access {
      namespace {
         /**
          * Generate a logical send opcode for a surface message and return
          * the result.
          */
         fs_reg
         emit_send(const fs_builder &bld, enum opcode opcode,
                   const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
                   unsigned dims, unsigned arg, unsigned rsize,
                   brw_predicate pred = BRW_PREDICATE_NONE)
         {
            /* Reduce the dynamically uniform surface index to a single
             * scalar.
             */
            const fs_reg usurface = bld.emit_uniformize(surface);
            const fs_reg srcs[] = {
               addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
            };
            const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
            fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));

            inst->regs_written = rsize * bld.dispatch_width() / 8;
            inst->predicate = pred;
            return dst;
         }
      }

      /**
       * Emit an untyped surface read opcode. \p dims determines the number
       * of components of the address and \p size the number of components of
       * the returned value.
       */
      fs_reg
      emit_untyped_read(const fs_builder &bld,
                        const fs_reg &surface, const fs_reg &addr,
                        unsigned dims, unsigned size,
                        brw_predicate pred)
      {
         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size, pred);
      }
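
      /* A usage sketch (illustrative, not part of the original code):
       * assuming "bld" is the current instruction builder, "surface" holds a
       * binding table index and "addr" a one-component dword offset, a
       * four-component untyped read could be emitted as:
       *
       *    const fs_reg data =
       *       emit_untyped_read(bld, surface, addr, 1, 4,
       *                         BRW_PREDICATE_NONE);
       *
       * which returns four 32-bit components copied verbatim from the
       * surface.
       */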

      /**
       * Emit an untyped surface write opcode. \p dims determines the number
       * of components of the address and \p size the number of components of
       * the argument.
       */
      void
      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
                         const fs_reg &addr, const fs_reg &src,
                         unsigned dims, unsigned size,
                         brw_predicate pred)
      {
         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0, pred);
      }

      /**
       * Emit an untyped surface atomic opcode. \p dims determines the number
       * of components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_untyped_atomic(const fs_builder &bld,
                          const fs_reg &surface, const fs_reg &addr,
                          const fs_reg &src0, const fs_reg &src1,
                          unsigned dims, unsigned rsize, unsigned op,
                          brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }
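
      /* For illustration (hypothetical usage, relying on the BRW_AOP_*
       * opcodes from brw_defines.h): an atomic add returning the previous
       * value could be emitted as
       *
       *    const fs_reg old =
       *       emit_untyped_atomic(bld, surface, addr, data, fs_reg(),
       *                           1, 1, BRW_AOP_ADD, BRW_PREDICATE_NONE);
       *
       * where the unused second source is left as BAD_FILE so only a single
       * payload component is loaded.
       */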

      /**
       * Emit a typed surface read opcode. \p dims determines the number of
       * components of the address and \p size the number of components of the
       * returned value.
       */
      fs_reg
      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
                      const fs_reg &addr, unsigned dims, unsigned size)
      {
         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size);
      }

      /**
       * Emit a typed surface write opcode. \p dims determines the number of
       * components of the address and \p size the number of components of the
       * argument.
       */
      void
      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned dims, unsigned size)
      {
         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0);
      }

      /**
       * Emit a typed surface atomic opcode. \p dims determines the number of
       * components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
                        const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned dims, unsigned rsize, unsigned op,
                        brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         /* Pass the predicate through so callers (e.g. the unbound-surface
          * check in emit_image_atomic) can actually predicate the message.
          */
         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }
   }
}

namespace {
   namespace image_format_info {
      /**
       * Simple 4-tuple of scalars used to pass around per-color component
       * values.
       */
      struct color_u {
         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
         {
         }

         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
            r(r), g(g), b(b), a(a)
         {
         }

         unsigned
         operator[](unsigned i) const
         {
            const unsigned xs[] = { r, g, b, a };
            return xs[i];
         }

         unsigned r, g, b, a;
      };

      /**
       * Return the per-channel bitfield widths for a given image format.
       */
      inline color_u
      get_bit_widths(mesa_format format)
      {
         return color_u(_mesa_get_format_bits(format, GL_RED_BITS),
                        _mesa_get_format_bits(format, GL_GREEN_BITS),
                        _mesa_get_format_bits(format, GL_BLUE_BITS),
                        _mesa_get_format_bits(format, GL_ALPHA_BITS));
      }

      /**
       * Return the per-channel bitfield shifts for a given image format.
       */
      inline color_u
      get_bit_shifts(mesa_format format)
      {
         const color_u widths = get_bit_widths(format);
         return color_u(0, widths.r, widths.r + widths.g,
                        widths.r + widths.g + widths.b);
      }
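
      /* Worked example (illustrative): for MESA_FORMAT_R8G8B8A8_UNORM,
       * get_bit_widths() returns (8, 8, 8, 8) and get_bit_shifts() returns
       * (0, 8, 16, 24), i.e. red occupies the least significant bits and
       * each subsequent channel starts where the previous one ends.
       */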

      /**
       * Return true if all present components have the same bit width.
       */
      inline bool
      is_homogeneous(mesa_format format)
      {
         const color_u widths = get_bit_widths(format);
         return ((widths.g == 0 || widths.g == widths.r) &&
                 (widths.b == 0 || widths.b == widths.r) &&
                 (widths.a == 0 || widths.a == widths.r));
      }

      /**
       * Return true if the format conversion boils down to a trivial copy.
       */
      inline bool
      is_conversion_trivial(const brw_device_info *devinfo, mesa_format format)
      {
         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
                 format == brw_lower_mesa_image_format(devinfo, format);
      }

      /**
       * Return true if the hardware natively supports some format with
       * compatible bitfield layout, but possibly different data types.
       */
      inline bool
      has_supported_bit_layout(const brw_device_info *devinfo,
                               mesa_format format)
      {
         const color_u widths = get_bit_widths(format);
         const color_u lower_widths = get_bit_widths(
            brw_lower_mesa_image_format(devinfo, format));

         return (widths.r == lower_widths.r &&
                 widths.g == lower_widths.g &&
                 widths.b == lower_widths.b &&
                 widths.a == lower_widths.a);
      }

      /**
       * Return true if we are required to spread individual components over
       * several components of the format used by the hardware (RG32 and
       * friends implemented as RGBA16UI).
       */
      inline bool
      has_split_bit_layout(const brw_device_info *devinfo, mesa_format format)
      {
         const mesa_format lower_format =
            brw_lower_mesa_image_format(devinfo, format);

         return (_mesa_format_num_components(format) <
                 _mesa_format_num_components(lower_format));
      }
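
      /* E.g. a two-component RG32 format lowered to a four-component
       * RGBA16UI-like format satisfies this predicate, so each 32-bit
       * component will be spread over two 16-bit components of the hardware
       * format.  (The exact lowering is decided by
       * brw_lower_mesa_image_format().)
       */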

      /**
       * Return true unless we have to fall back to untyped surface access.
       * Fail!
       */
      inline bool
      has_matching_typed_format(const brw_device_info *devinfo,
                                mesa_format format)
      {
         return (_mesa_get_format_bytes(format) <= 4 ||
                 (_mesa_get_format_bytes(format) <= 8 &&
                  (devinfo->gen >= 8 || devinfo->is_haswell)) ||
                 devinfo->gen >= 9);
      }

      /**
       * Return true if the hardware returns garbage in the unused high bits
       * of each component. This may happen on IVB because we rely on the
       * undocumented behavior that typed reads from surfaces of the
       * unsupported R8 and R16 formats return useful data in their least
       * significant bits.
       */
      inline bool
      has_undefined_high_bits(const brw_device_info *devinfo,
                              mesa_format format)
      {
         const mesa_format lower_format =
            brw_lower_mesa_image_format(devinfo, format);

         return (devinfo->gen == 7 && !devinfo->is_haswell &&
                 (lower_format == MESA_FORMAT_R_UINT16 ||
                  lower_format == MESA_FORMAT_R_UINT8));
      }

      /**
       * Return true if the format represents values as signed integers
       * requiring sign extension when unpacking.
       */
      inline bool
      needs_sign_extension(mesa_format format)
      {
         return (_mesa_get_format_datatype(format) == GL_SIGNED_NORMALIZED ||
                 _mesa_get_format_datatype(format) == GL_INT);
      }
   }

   namespace image_validity {
      /**
       * Check whether the bound image is suitable for untyped access.
       */
      brw_predicate
      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
                               brw_predicate pred)
      {
         const brw_device_info *devinfo = bld.shader->devinfo;
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check whether the first stride component (i.e. the Bpp value)
             * is greater than four, which on Gen7 indicates that a surface of
             * type RAW has been bound for untyped access. Reading or writing
             * to a surface of a type other than RAW using untyped surface
             * messages causes a hang on IVB and VLV.
             */
            set_predicate(pred,
                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
                                  BRW_CONDITIONAL_G));

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent generations handle the format mismatch
             * gracefully.
             */
            return pred;
         }
      }

      /**
       * Check whether there is an image bound at the given index and write
       * the comparison result to f0.0. Returns an appropriate predication
       * mode to use on subsequent image operations.
       */
      brw_predicate
      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
      {
         const brw_device_info *devinfo = bld.shader->devinfo;
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check the first component of the size field to find out if the
             * image is bound. Necessary on IVB for typed atomics because
             * they don't seem to respect null surfaces and will happily
             * corrupt or read random memory when no image is bound.
             */
            bld.CMP(bld.null_reg_ud(),
                    retype(size, BRW_REGISTER_TYPE_UD),
                    brw_imm_d(0), BRW_CONDITIONAL_NZ);

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent platforms implement compliant behavior when a null
             * surface is bound.
             */
            return BRW_PREDICATE_NONE;
         }
      }

      /**
       * Check whether the provided coordinates are within the image bounds
       * and write the comparison result to f0.0. Returns an appropriate
       * predication mode to use on subsequent image operations.
       */
      brw_predicate
      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
                        const fs_reg &addr, unsigned dims)
      {
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         for (unsigned c = 0; c < dims; ++c)
            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
                          bld.CMP(bld.null_reg_ud(),
                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
                                  offset(size, bld, c),
                                  BRW_CONDITIONAL_L));

         return BRW_PREDICATE_NORMAL;
      }
   }

   namespace image_coordinates {
      /**
       * Return the total number of coordinates needed to address a texel of
       * the surface, which may be more than the sum of \p surf_dims and \p
       * arr_dims if padding is required.
       */
      unsigned
      num_image_coordinates(const fs_builder &bld,
                            unsigned surf_dims, unsigned arr_dims,
                            mesa_format format)
      {
         /* HSW in vec4 mode and our software coordinate handling for untyped
          * reads want the array index to be at the Z component.
          */
         const bool array_index_at_z =
            !image_format_info::has_matching_typed_format(
               bld.shader->devinfo, format);
         const unsigned zero_dims =
            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);

         return surf_dims + zero_dims + arr_dims;
      }

      /**
       * Transform image coordinates into the form expected by the
       * implementation.
       */
      fs_reg
      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
                             unsigned surf_dims, unsigned arr_dims,
                             mesa_format format)
      {
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (dims > surf_dims + arr_dims) {
            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
            /* The array index is required to be passed in as the Z component,
             * so insert a zero at the Y component to shift it into the right
             * position.
             *
             * FINISHME: Factor out this frequently recurring pattern into a
             * helper function.
             */
            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
            const fs_reg dst = bld.vgrf(addr.type, dims);
            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
            return dst;
         } else {
            return addr;
         }
      }
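
      /* Concrete example: a 1-D array image accessed through the untyped
       * fall-back path (surf_dims == 1, arr_dims == 1) has its coordinate
       * vector (x, layer) rewritten as (x, 0, layer), placing the array
       * index in the Z component as required.
       */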

      /**
       * Calculate the offset in memory of the texel given by \p coord.
       *
       * This is meant to be used with untyped surface messages to access a
       * tiled surface, which involves manually taking into account the
       * tiling and swizzling modes of the surface, so it will hopefully not
       * happen very often.
       *
       * The tiling algorithm implemented here matches either the X or Y
       * tiling layouts supported by the hardware depending on the tiling
       * coefficients passed to the program as uniforms. See Volume 1 Part 2
       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
       * explanation of the hardware tiling format.
       */
      fs_reg
      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
                               const fs_reg &coord, unsigned dims)
      {
         const brw_device_info *devinfo = bld.shader->devinfo;
         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);

         /* Shift the coordinates by the fixed surface offset. It may be
          * non-zero if the image is a single slice of a higher-dimensional
          * surface, or if a non-zero mipmap level of the surface is bound to
          * the pipeline. The offset needs to be applied here rather than at
          * surface state set-up time because the desired slice-level may
          * start mid-tile, so simply shifting the surface base address
          * wouldn't give a well-formed tiled surface in the general case.
          */
         for (unsigned c = 0; c < 2; ++c)
            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
                    (c < dims ?
                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
                     fs_reg(brw_imm_d(0))));

         /* The layout of 3-D textures in memory is sort-of like a tiling
          * format. At each miplevel, the slices are arranged in rows of
          * 2^level slices per row. The slice row is stored in tmp.y and
          * the slice within the row is stored in tmp.x.
          *
          * The layout of 2-D array textures and cubemaps is much simpler:
          * Depending on whether the ARYSPC_LOD0 layout is in use, the texture
          * will be stored in memory as an array of slices, each one being a
          * 2-D arrangement of miplevels, or as a 2-D arrangement of
          * miplevels, each one being an array of slices. In either case the
          * separation between slices of the same LOD is equal to the qpitch
          * value provided as stride.w.
          *
          * This code can be made to handle both 2-D array textures and 3-D
          * textures by passing in the miplevel as tile.z for 3-D textures
          * and 0 in tile.z for 2-D array textures.
          *
          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
          * of the hardware 3D texture and 2D array layouts.
          */
         if (dims > 2) {
            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
             * index.
             */
            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
            bld.SHR(offset(tmp, bld, 1),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
                    offset(tile, bld, 2));

            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
             * slice offset.
             */
            for (unsigned c = 0; c < 2; ++c) {
               bld.MUL(offset(tmp, bld, c),
                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
               bld.ADD(offset(addr, bld, c),
                       offset(addr, bld, c), offset(tmp, bld, c));
            }
         }

         if (dims > 1) {
            /* Calculate the major/minor x and y indices. In order to
             * accommodate both X and Y tiling, the Y-major tiling format is
             * treated as being a bunch of narrow X-tiles placed next to each
             * other. This means that the tile width for Y-tiling is actually
             * the width of one sub-column of the Y-major tile where each 4K
             * tile has 8 512B sub-columns.
             *
             * The major Y value is the row of tiles in which the pixel lives.
             * The major X value is the tile sub-column in which the pixel
             * lives; for X tiling, this is the same as the tile column, for Y
             * tiling, each tile has 8 sub-columns. The minor X and Y indices
             * are the position within the sub-column.
             */
            for (unsigned c = 0; c < 2; ++c) {
               /* Calculate the minor x and y indices. */
               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
                       brw_imm_d(0), offset(addr, bld, c));

               /* Calculate the major x and y indices. */
               bld.SHR(offset(major, bld, c),
                       offset(addr, bld, c), offset(tile, bld, c));
            }

            /* Calculate the texel index from the start of the tile row and
             * the vertical coordinate of the row.
             * Equivalent to:
             *   tmp.x = (major.x << tile.y << tile.x) +
             *           (minor.y << tile.x) + minor.x
             *   tmp.y = major.y << tile.y
             */
            bld.SHL(tmp, major, offset(tile, bld, 1));
            bld.ADD(tmp, tmp, offset(minor, bld, 1));
            bld.SHL(tmp, tmp, offset(tile, bld, 0));
            bld.ADD(tmp, tmp, minor);
            bld.SHL(offset(tmp, bld, 1),
                    offset(major, bld, 1), offset(tile, bld, 1));

            /* Add it to the start of the tile row. */
            bld.MUL(offset(tmp, bld, 1),
                    offset(tmp, bld, 1), offset(stride, bld, 1));
            bld.ADD(tmp, tmp, offset(tmp, bld, 1));

            /* Multiply by the Bpp value. */
            bld.MUL(dst, tmp, stride);

            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
               /* Take into account the two dynamically specified shifts.
                * Both are needed to implement swizzling of X-tiled
                * surfaces. For Y-tiled surfaces only one bit needs to be
                * XOR-ed with bit 6 of the memory address, so a swz value of
                * 0xff (actually interpreted as 31 by the hardware) will be
                * provided to cause the relevant bit of tmp.y to be zero and
                * turn the first XOR into the identity. For linear surfaces
                * or platforms lacking address swizzling both shifts will be
                * 0xff, causing the relevant bits of both tmp.x and tmp.y to
                * be zero, which effectively disables swizzling.
                */
               for (unsigned c = 0; c < 2; ++c)
                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));

               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
               bld.XOR(dst, dst, tmp);
            }

         } else {
            /* Multiply by the Bpp/stride value. Note that addr.y may be
             * non-zero even if the image is one-dimensional because a
             * vertical offset may have been applied above to select a
             * non-zero slice or level of a higher-dimensional texture.
             */
            bld.MUL(offset(addr, bld, 1),
                    offset(addr, bld, 1), offset(stride, bld, 1));
            bld.ADD(addr, addr, offset(addr, bld, 1));
            bld.MUL(dst, addr, stride);
         }

         return dst;
      }
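
      /* For reference, a plausible set of tiling coefficients (the actual
       * values are supplied by the driver as uniforms, so these are
       * illustrative assumptions only): an X-tile 512 bytes wide and 8 rows
       * high corresponds to tile.xy = (9, 3), while treating a Y-major tile
       * as eight 16-byte-wide, 32-row sub-columns corresponds to
       * tile.xy = (4, 5). With tile.xy = (0, 0) the minor indices collapse
       * to zero and the computation degenerates to the linear case.
       */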
   }

   namespace image_format_conversion {
      using image_format_info::color_u;

      namespace {
         /**
          * Maximum representable value in an unsigned integer with the given
          * number of bits.
          */
         inline unsigned
         scale(unsigned n)
         {
            return (1 << n) - 1;
         }
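
         /* E.g. scale(8) == 255 and scale(16) == 65535, the largest UNORM8
          * and UNORM16 values respectively.
          */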
      }

      /**
       * Pack the vector \p src in a bitfield given the per-component bit
       * shifts and widths. Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_pack(const fs_builder &bld, const fs_reg &src,
                const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         bool seen[4] = {};

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);

               /* Shift each component left to the correct bitfield position. */
               bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));

               /* Add everything up. */
               if (seen[shifts[c] / 32]) {
                  bld.OR(offset(dst, bld, shifts[c] / 32),
                         offset(dst, bld, shifts[c] / 32), tmp);
               } else {
                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
                  seen[shifts[c] / 32] = true;
               }
            }
         }

         return dst;
      }
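
      /* Illustration: with the RGBA8 widths (8, 8, 8, 8) and shifts
       * (0, 8, 16, 24), all four components land in dword 0, so the loop
       * emits one MOV and three ORs that assemble the packed pixel in dst.x.
       * With 32-bit widths and shifts that are multiples of 32, each
       * component simply lands in its own dword.
       */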

      /**
       * Unpack a vector from the bitfield \p src given the per-component bit
       * shifts and widths. Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_unpack(const fs_builder &bld, const fs_reg &src,
                  const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Shift left to discard the most significant bits. */
               bld.SHL(offset(dst, bld, c),
                       offset(src, bld, shifts[c] / 32),
                       brw_imm_ud(32 - shifts[c] % 32 - widths[c]));

               /* Shift back to the least significant bits using an arithmetic
                * shift to get sign extension on signed types.
                */
               bld.ASR(offset(dst, bld, c),
                       offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
            }
         }

         return dst;
      }
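
      /* Illustration: to extract a signed 8-bit component stored at bit 16,
       * the code shifts left by 32 - 16 - 8 = 8 so the field occupies bits
       * 24..31, then arithmetic-shifts right by 32 - 8 = 24, leaving the
       * sign-extended value in the low byte.
       */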

      /**
       * Convert an integer vector into another integer vector of the
       * specified bit widths, properly handling overflow.
       */
      fs_reg
      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         assert(src.type == dst.type);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp to the maximum value. */
               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
                               brw_imm_d((int)scale(widths[c] - s)),
                               BRW_CONDITIONAL_L);

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                  brw_imm_d(-(int)scale(widths[c] - s) - 1),
                                  BRW_CONDITIONAL_GE);

               /* Mask off all but the bits we actually want. Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d((1 << widths[c]) - 1));
            }
         }

         return dst;
      }
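
      /* Illustration: a conversion to signed 8-bit clamps each component to
       * at most scale(7) = 127 and at least -128, then masks with 0xff so
       * the hardware sees the two's complement bit pattern instead of a
       * negative 32-bit value it would clamp to the unsigned maximum.
       */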

      /**
       * Convert a normalized fixed-point vector of the specified signedness
       * and bit widths into a floating point vector.
       */
      fs_reg
      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
                               const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Convert to float. */
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Divide by the normalization constants. */
               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
                       brw_imm_f(1.0f / scale(widths[c] - s)));

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c),
                                  offset(dst, bld, c), brw_imm_f(-1.0f),
                                  BRW_CONDITIONAL_GE);
            }
         }
         return dst;
      }
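
      /* Illustration: for UNORM8 the integer 255 maps to 255 * (1 / 255) =
       * 1.0f, while for SNORM8 the value -128 scales to about -1.008 and is
       * then clamped to the -1.0f lower bound required of signed normalized
       * formats.
       */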

      /**
       * Convert a floating-point vector into a normalized fixed-point vector
       * of the specified signedness and bit widths.
       */
      fs_reg
      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
                             const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp the normalized floating-point argument. */
               if (is_signed) {
                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
                                  brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);

                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(1.0f), BRW_CONDITIONAL_L);
               } else {
                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
                                             offset(src, bld, c)));
               }

               /* Multiply by the normalization constants. */
               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
                       brw_imm_f((float)scale(widths[c] - s)));

               /* Convert to integer. */
               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));

               /* Mask off all but the bits we actually want. Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d((1 << widths[c]) - 1));
            }
         }

         return dst;
      }

      /**
       * Convert a floating point vector of the specified bit widths into a
       * 32-bit floating point vector.
       */
      fs_reg
      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
                * This works because they have a 5-bit exponent just like the
                * 16-bit floating point format, and they have no sign bit.
                */
               if (widths[c] < 16)
                  bld.SHL(offset(dst, bld, c),
                          offset(dst, bld, c), brw_imm_ud(15 - widths[c]));

               /* Convert to 32-bit floating point. */
               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
            }
         }

         return fdst;
      }
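
      /* Illustration: an 11-bit float (5-bit exponent, 6-bit mantissa, no
       * sign bit) shifted left by 15 - 11 = 4 lines its exponent up with the
       * half-float exponent field while keeping the sign bit zero, so
       * F16TO32 can expand it like any other half-precision value.
       */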

      /**
       * Convert a vector into a floating point vector of the specified bit
       * widths.
       */
      fs_reg
      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
                            const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));

               /* Clamp to the minimum value (the 10-bit and 11-bit formats
                * are unsigned, so negative values cannot be represented).
                */
               if (widths[c] < 16)
                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(0.0f), BRW_CONDITIONAL_GE);

               /* Convert to 16-bit floating-point. */
               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));

               /* Discard the least significant bits to get floating point
                * numbers of the requested width. This works because the
                * 10-bit and 11-bit floating point formats have a 5-bit
                * exponent just like the 16-bit format, and they have no sign
                * bit.
                */
               if (widths[c] < 16)
                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_ud(15 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Fill missing components of a vector with 0, 0, 0, 1.
       */
      fs_reg
      emit_pad(const fs_builder &bld, const fs_reg &src,
               const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);
         const unsigned pad[] = { 0, 0, 0, 1 };

         for (unsigned c = 0; c < 4; ++c)
            bld.MOV(offset(dst, bld, c),
                    widths[c] ? offset(src, bld, c)
                              : fs_reg(brw_imm_ud(pad[c])));

         return dst;
      }
   }
}

namespace brw {
   namespace image_access {
      /**
       * Load a vector from a surface of the given format and dimensionality
       * at the given coordinates. \p surf_dims and \p arr_dims give the
       * number of non-array and array coordinates of the image respectively.
       */
      fs_reg
      emit_image_load(const fs_builder &bld,
                      const fs_reg &image, const fs_reg &addr,
                      unsigned surf_dims, unsigned arr_dims,
                      mesa_format format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const brw_device_info *devinfo = bld.shader->devinfo;
         const mesa_format lower_format =
            brw_lower_mesa_image_format(devinfo, format);
         fs_reg tmp;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (has_matching_typed_format(devinfo, format)) {
            /* Hopefully we get here most of the time... */
            tmp = emit_typed_read(bld, image, saddr, dims,
                                  _mesa_format_num_components(lower_format));
         } else {
            /* Untyped surface reads return 32 bits of the surface per
             * component, without any sort of unpacking or type conversion,
             */
            const unsigned size = _mesa_get_format_bytes(format) / 4;

            /* they don't properly handle out of bounds access, so we have to
             * check manually if the coordinates are valid and predicate the
             * surface read on the result,
             */
            const brw_predicate pred =
               emit_untyped_image_check(bld, image,
                                        emit_bounds_check(bld, image,
                                                          saddr, dims));

            /* and they don't know about surface coordinates, so we need to
             * convert them to a raw memory offset.
             */
            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);

            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);

            /* An out of bounds surface access should give zero as its result. */
            for (unsigned c = 0; c < size; ++c)
               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
                                           offset(tmp, bld, c), brw_imm_d(0)));
         }

         /* Set the register type to D instead of UD if the data type is
          * represented as a signed integer in memory so that sign extension
          * is handled correctly by unpack.
          */
         if (needs_sign_extension(format))
            tmp = retype(tmp, BRW_REGISTER_TYPE_D);

         if (!has_supported_bit_layout(devinfo, format)) {
            /* Unpack individual vector components from the bitfield if the
             * hardware is unable to do it for us.
             */
            if (has_split_bit_layout(devinfo, format))
               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
                               get_bit_widths(lower_format));
            else
               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
                                 get_bit_widths(format));

         } else if ((needs_sign_extension(format) &&
                     !is_conversion_trivial(devinfo, format)) ||
                    has_undefined_high_bits(devinfo, format)) {
            /* Perform a trivial unpack even though the bit layout matches in
             * order to get the most significant bits of each component
             * initialized properly.
             */
            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
                              get_bit_widths(format));
         }

         if (!_mesa_is_format_integer(format)) {
            if (is_conversion_trivial(devinfo, format)) {
               /* Just need to cast the vector to the target type. */
               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
            } else {
               /* Do the right sort of type conversion to float. */
               if (_mesa_get_format_datatype(format) == GL_FLOAT)
                  tmp = emit_convert_from_float(
                     bld, tmp, get_bit_widths(format));
               else
                  tmp = emit_convert_from_scaled(
                     bld, tmp, get_bit_widths(format),
                     _mesa_is_format_signed(format));
            }
         }

         /* Initialize missing components of the result. */
         return emit_pad(bld, tmp, get_bit_widths(format));
      }
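
      /* A hypothetical call, for illustration: loading from a 2-D non-array
       * image bound with an RGBA8 format, where "image" points at the image
       * parameter block and "addr" holds a two-component coordinate:
       *
       *    const fs_reg color =
       *       emit_image_load(bld, image, addr, 2, 0,
       *                       MESA_FORMAT_R8G8B8A8_UNORM);
       *
       * returns a four-component vector of 32-bit floats.
       */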

      /**
       * Store a vector in a surface of the given format and dimensionality at
       * the given coordinates. \p surf_dims and \p arr_dims give the number
       * of non-array and array coordinates of the image respectively.
       */
      void
      emit_image_store(const fs_builder &bld, const fs_reg &image,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned surf_dims, unsigned arr_dims,
                       mesa_format format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const brw_device_info *devinfo = bld.shader->devinfo;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (format == MESA_FORMAT_NONE) {
            /* We don't know what the format is, but that's fine because it
             * implies write-only access, and typed surface writes are always
             * able to take care of type conversion and packing for us.
             */
            emit_typed_write(bld, image, saddr, src, dims, 4);

         } else {
            const mesa_format lower_format =
               brw_lower_mesa_image_format(devinfo, format);
            fs_reg tmp = src;

            if (!is_conversion_trivial(devinfo, format)) {
               /* Do the right sort of type conversion. */
               if (_mesa_get_format_datatype(format) == GL_FLOAT)
                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));

               else if (_mesa_is_format_integer(format))
                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
                                                _mesa_is_format_signed(format));

               else
                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
                                               _mesa_is_format_signed(format));
            }

            /* We're down to bit manipulation at this point. */
            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);

            if (!has_supported_bit_layout(devinfo, format)) {
               /* Pack the vector components into a bitfield if the hardware
                * is unable to do it for us.
                */
               if (has_split_bit_layout(devinfo, format))
                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
                                    get_bit_widths(lower_format));

               else
                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
                                  get_bit_widths(format));
            }

            if (has_matching_typed_format(devinfo, format)) {
               /* Hopefully we get here most of the time... */
               emit_typed_write(bld, image, saddr, tmp, dims,
                                _mesa_format_num_components(lower_format));

            } else {
               /* Untyped surface writes store 32 bits of the surface per
                * component, without any sort of packing or type conversion,
                */
               const unsigned size = _mesa_get_format_bytes(format) / 4;

               /* they don't properly handle out of bounds access, so we have
                * to check manually if the coordinates are valid and predicate
                * the surface write on the result,
                */
               const brw_predicate pred =
                  emit_untyped_image_check(bld, image,
                                           emit_bounds_check(bld, image,
                                                             saddr, dims));

               /* and, phew, they don't know about surface coordinates, so we
                * need to convert them to a raw memory offset.
                */
               const fs_reg laddr = emit_address_calculation(
                  bld, image, saddr, dims);

               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
            }
         }
      }

      /**
       * Perform an atomic read-modify-write operation in a surface of the
       * given dimensionality at the given coordinates. \p surf_dims and \p
       * arr_dims give the number of non-array and array coordinates of the
       * image respectively. Main building block of the imageAtomic GLSL
       * built-ins.
       */
      fs_reg
      emit_image_atomic(const fs_builder &bld,
                        const fs_reg &image, const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned surf_dims, unsigned arr_dims,
                        unsigned rsize, unsigned op)
      {
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         /* Avoid performing an atomic operation on an unbound surface. */
         const brw_predicate pred = emit_typed_atomic_check(bld, image);

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
                                   MESA_FORMAT_R_UINT32);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims,
                                  MESA_FORMAT_R_UINT32);

         /* Thankfully we can do without untyped atomics here. */
         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
                                              dims, rsize, op, pred);

         /* An unbound surface access should give zero as its result. */
         if (rsize && pred)
            set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));

         return tmp;
      }
   }
}