intel/compiler: Do image load/store lowering to NIR

author Jason Ekstrand <jason.ekstrand@intel.com>

Sat, 27 Jan 2018 21:19:57 +0000 (13:19 -0800)

committer Jason Ekstrand <jason.ekstrand@intel.com>

Wed, 29 Aug 2018 19:04:02 +0000 (14:04 -0500)
author Jason Ekstrand <jason.ekstrand@intel.com>
Sat, 27 Jan 2018 21:19:57 +0000 (13:19 -0800)
committer Jason Ekstrand <jason.ekstrand@intel.com>
Wed, 29 Aug 2018 19:04:02 +0000 (14:04 -0500)
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py

index 17212c4862f1da775b7277199b76983c338873f0..170f954e3752b7b9416d7cbca1e7a462d52f9ec5 100644 (file)
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -312,6 +312,15 @@ intrinsic("image_deref_atomic_fadd",  src_comp=[1, 4, 1, 1], dest_comp=1)
  intrinsic("image_deref_size",    src_comp=[1], dest_comp=0, flags=[CAN_ELIMINATE, CAN_REORDER])
  intrinsic("image_deref_samples", src_comp=[1], dest_comp=1, flags=[CAN_ELIMINATE, CAN_REORDER])
  
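+# Intel-specific query for loading from the brw_image_param struct passed
+# into the shader as a uniform.  The only source is a deref of the image
+# variable.  The BASE const index specifies which of the six parameters to
+# load.
+#
+# The two "raw" intrinsics that follow take the image deref and a
+# one-component address; the store additionally takes the data to write.
+# The Intel back-end emits them as untyped reads and writes.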
+intrinsic("image_deref_load_param_intel", src_comp=[1], dest_comp=0,
+          indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
+intrinsic("image_deref_load_raw_intel", src_comp=[1, 1], dest_comp=0,
+          flags=[CAN_ELIMINATE])
+intrinsic("image_deref_store_raw_intel", src_comp=[1, 1, 0])
+
  # Vulkan descriptor set intrinsics
  #
  # The Vulkan API uses a different binding model from GL.  In the Vulkan
diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources

index 5f6cd96825bb962b23bf5a5e5b96c5fd3802b61b..d10c4511734ce38ffd269c020269cd1932ab3ac1 100644 (file)
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -84,6 +84,7 @@ COMPILER_FILES = \
         compiler/brw_nir_analyze_ubo_ranges.c \
         compiler/brw_nir_attribute_workarounds.c \
         compiler/brw_nir_lower_cs_intrinsics.c \
+       compiler/brw_nir_lower_image_load_store.c \
         compiler/brw_nir_opt_peephole_ffma.c \
         compiler/brw_nir_tcs_workarounds.c \
         compiler/brw_packed_float.c \
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp

index 67c0bee7acdfe66e3cdd36e4db65b5c2a2800663..b2be91f91174f58be338c59062ea8bc04dcb7165 100644 (file)
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3865,38 +3865,33 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
     case nir_intrinsic_image_deref_atomic_xor:
     case nir_intrinsic_image_deref_atomic_exchange:
     case nir_intrinsic_image_deref_atomic_comp_swap: {
-      using namespace image_access;
-
        if (stage == MESA_SHADER_FRAGMENT &&
            instr->intrinsic != nir_intrinsic_image_deref_load)
           brw_wm_prog_data(prog_data)->has_side_effects = true;
  
        /* Get the referenced image variable and type. */
        nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
-      const nir_variable *var = nir_deref_instr_get_variable(deref);
-      const glsl_type *type = var->type->without_array();
-      const brw_reg_type base_type = get_image_base_type(type);
+      const glsl_type *type = deref->type;
  
        /* Get some metadata from the image intrinsic. */
        const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
-      const unsigned arr_dims = type->sampler_array ? 1 : 0;
-      const unsigned surf_dims = type->coordinate_components() - arr_dims;
-      const unsigned format = var->data.image.format;
+      const unsigned dims = type->coordinate_components();
        const unsigned dest_components = nir_intrinsic_dest_components(instr);
  
        /* Get the arguments of the image intrinsic. */
        const fs_reg image = get_nir_image_deref(deref);
-      const fs_reg addr = retype(get_nir_src(instr->src[1]),
-                                 BRW_REGISTER_TYPE_UD);
+      const fs_reg coords = retype(get_nir_src(instr->src[1]),
+                                   BRW_REGISTER_TYPE_UD);
        fs_reg tmp;
  
        /* Emit an image load, store or atomic op. */
-      if (instr->intrinsic == nir_intrinsic_image_deref_load)
-         tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
-      else if (instr->intrinsic == nir_intrinsic_image_deref_store) {
-         const fs_reg src0 = retype(get_nir_src(instr->src[3]), base_type);
-         emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
-                          var->data.image.write_only ? GL_NONE : format);
+      if (instr->intrinsic == nir_intrinsic_image_deref_load) {
+         tmp = emit_typed_read(bld, image, coords, dims,
+                               instr->num_components);
+      } else if (instr->intrinsic == nir_intrinsic_image_deref_store) {
+         const fs_reg src0 = get_nir_src(instr->src[3]);
+         emit_typed_write(bld, image, coords, src0, dims,
+                          instr->num_components);
        } else {
           int op;
           unsigned num_srcs = info->num_srcs;
@@ -3938,25 +3933,61 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
           }
  
           const fs_reg src0 = (num_srcs >= 4 ?
-                              retype(get_nir_src(instr->src[3]), base_type) :
-                              fs_reg());
+                              get_nir_src(instr->src[3]) : fs_reg());
           const fs_reg src1 = (num_srcs >= 5 ?
-                              retype(get_nir_src(instr->src[4]), base_type) :
-                              fs_reg());
+                              get_nir_src(instr->src[4]) : fs_reg());
  
-         tmp = emit_image_atomic(bld, image, addr, src0, src1,
-                                 surf_dims, arr_dims, dest_components,
-                                 op);
+         tmp = emit_typed_atomic(bld, image, coords, src0, src1, dims, 1, op);
        }
  
        /* Assign the result. */
        for (unsigned c = 0; c < dest_components; ++c) {
-         bld.MOV(offset(retype(dest, base_type), bld, c),
-               offset(tmp, bld, c));
+         bld.MOV(offset(retype(dest, tmp.type), bld, c),
+                 offset(tmp, bld, c));
+      }
+      break;
+   }
+
+   case nir_intrinsic_image_deref_load_param_intel: {
+      /* Copy one entry of the image's parameter block into the destination.
+       * The BASE const index selects the entry; entries are addressed in
+       * steps of four components from the start of the image register.
+       */
+      const fs_reg image_reg =
+         get_nir_image_deref(nir_src_as_deref(instr->src[0]));
+      const fs_reg src = offset(image_reg, bld, nir_intrinsic_base(instr) * 4);
+      const fs_reg dst = retype(dest, src.type);
+
+      for (unsigned i = 0; i < instr->dest.ssa.num_components; ++i)
+         bld.MOV(offset(dst, bld, i), offset(src, bld, i));
+
+      break;
+   }
+
+   case nir_intrinsic_image_deref_load_raw_intel: {
+      /* Untyped read from an image: src[0] is the image deref and src[1]
+       * the address, which is reinterpreted as UD before being used.
+       */
+      nir_deref_instr *image_deref = nir_src_as_deref(instr->src[0]);
+      const fs_reg image = get_nir_image_deref(image_deref);
+      const fs_reg address =
+         retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD);
+
+      const fs_reg result =
+         emit_untyped_read(bld, image, address, 1, instr->num_components);
+
+      /* Move every component that was read into the destination. */
+      for (unsigned i = 0; i < instr->num_components; ++i) {
+         bld.MOV(offset(retype(dest, result.type), bld, i),
+                 offset(result, bld, i));
        }
       break;
     }
  
+   case nir_intrinsic_image_deref_store_raw_intel: {
+      /* Untyped write to an image: src[0] is the image deref, src[1] the
+       * address and src[2] the data to write.  Address and data are both
+       * reinterpreted as UD.
+       */
+      const fs_reg image = get_nir_image_deref(nir_src_as_deref(instr->src[0]));
+      const fs_reg addr = retype(get_nir_src(instr->src[1]),
+                                 BRW_REGISTER_TYPE_UD);
+      const fs_reg data = retype(get_nir_src(instr->src[2]),
+                                 BRW_REGISTER_TYPE_UD);
+
+      /* Only touch the fragment-shader prog_data when this really is a
+       * fragment shader, mirroring the guard used by the typed image
+       * load/store/atomic case.  Writing through the wm downcast for any
+       * other stage would modify a prog_data of a different type.
+       */
+      if (stage == MESA_SHADER_FRAGMENT)
+         brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+      emit_untyped_write(bld, image, addr, data, 1,
+                         instr->num_components);
+      break;
+   }
+
     case nir_intrinsic_group_memory_barrier:
     case nir_intrinsic_memory_barrier_shared:
     case nir_intrinsic_memory_barrier_atomic_counter:
@@ -3979,51 +4010,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        break;
     }
  
-   case nir_intrinsic_image_deref_size: {
-      /* Get the referenced image variable and type. */
-      nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
-      const nir_variable *var = nir_deref_instr_get_variable(deref);
-      const glsl_type *type = var->type->without_array();
-
-      /* Get the size of the image. */
-      const fs_reg image = get_nir_image_deref(deref);
-      const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
-      /* For 1DArray image types, the array index is stored in the Z component.
-       * Fix this by swizzling the Z component to the Y component.
-       */
-      const bool is_1d_array_image =
-                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
-                  type->sampler_array;
-
-      /* For CubeArray images, we should count the number of cubes instead
-       * of the number of faces. Fix it by dividing the (Z component) by 6.
-       */
-      const bool is_cube_array_image =
-                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
-                  type->sampler_array;
-
-      /* Copy all the components. */
-      for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
-         if ((int)c >= type->coordinate_components()) {
-             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                     brw_imm_d(1));
-         } else if (c == 1 && is_1d_array_image) {
-            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                    offset(size, bld, 2));
-         } else if (c == 2 && is_cube_array_image) {
-            bld.emit(SHADER_OPCODE_INT_QUOTIENT,
-                     offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                     offset(size, bld, c), brw_imm_d(6));
-         } else {
-            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                    offset(size, bld, c));
-         }
-       }
-
-      break;
-   }
-
     case nir_intrinsic_image_deref_samples:
        /* The driver does not support multi-sampled images. */
        bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
diff --git a/src/intel/compiler/brw_fs_surface_builder.cpp b/src/intel/compiler/brw_fs_surface_builder.cpp

index 0b8418ca7364cc798e7678ad1298722071f375fa..fed04da5e7a829b444b5936c9d9ca2fdaaf375d7 100644 (file)
--- a/src/intel/compiler/brw_fs_surface_builder.cpp
+++ b/src/intel/compiler/brw_fs_surface_builder.cpp
@@ -206,1033 +206,3 @@ namespace brw {
        }
     }
  }
-
-namespace {
-   namespace image_format_info {
-      /* The higher compiler layers use the GL enums for image formats even if
-       * they come in from SPIR-V or Vulkan.  We need to turn them into an ISL
-       * enum before we can use them.
-       */
-      static enum isl_format
-      isl_format_for_gl_format(uint32_t gl_format)
-      {
-         switch (gl_format) {
-         case GL_R8:             return ISL_FORMAT_R8_UNORM;
-         case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
-         case GL_R8UI:           return ISL_FORMAT_R8_UINT;
-         case GL_R8I:            return ISL_FORMAT_R8_SINT;
-         case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
-         case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
-         case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
-         case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
-         case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
-         case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
-         case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
-         case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
-         case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
-         case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
-         case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
-         case GL_R16:            return ISL_FORMAT_R16_UNORM;
-         case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
-         case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
-         case GL_R16UI:          return ISL_FORMAT_R16_UINT;
-         case GL_R16I:           return ISL_FORMAT_R16_SINT;
-         case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
-         case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
-         case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
-         case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
-         case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
-         case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
-         case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
-         case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
-         case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
-         case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
-         case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
-         case GL_R32UI:          return ISL_FORMAT_R32_UINT;
-         case GL_R32I:           return ISL_FORMAT_R32_SINT;
-         case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
-         case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
-         case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
-         case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
-         case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
-         case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
-         case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
-         default:
-            assert(!"Invalid image format");
-            return ISL_FORMAT_UNSUPPORTED;
-         }
-      }
-
-      /**
-       * Simple 4-tuple of scalars used to pass around per-color component
-       * values.
-       */
-      struct color_u {
-         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
-         {
-         }
-
-         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
-            r(r), g(g), b(b), a(a)
-         {
-         }
-
-         unsigned
-         operator[](unsigned i) const
-         {
-            const unsigned xs[] = { r, g, b, a };
-            return xs[i];
-         }
-
-         unsigned r, g, b, a;
-      };
-
-      /**
-       * Return the per-channel bitfield widths for a given image format.
-       */
-      inline color_u
-      get_bit_widths(isl_format format)
-      {
-         const isl_format_layout *fmtl = isl_format_get_layout(format);
-
-         return color_u(fmtl->channels.r.bits,
-                        fmtl->channels.g.bits,
-                        fmtl->channels.b.bits,
-                        fmtl->channels.a.bits);
-      }
-
-      /**
-       * Return the per-channel bitfield shifts for a given image format.
-       */
-      inline color_u
-      get_bit_shifts(isl_format format)
-      {
-         const color_u widths = get_bit_widths(format);
-         return color_u(0, widths.r, widths.r + widths.g,
-                        widths.r + widths.g + widths.b);
-      }
-
-      /**
-       * Return true if all present components have the same bit width.
-       */
-      inline bool
-      is_homogeneous(isl_format format)
-      {
-         const color_u widths = get_bit_widths(format);
-         return ((widths.g == 0 || widths.g == widths.r) &&
-                 (widths.b == 0 || widths.b == widths.r) &&
-                 (widths.a == 0 || widths.a == widths.r));
-      }
-
-      /**
-       * Return true if the format conversion boils down to a trivial copy.
-       */
-      inline bool
-      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
-      {
-         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
-                 format == isl_lower_storage_image_format(devinfo, format);
-      }
-
-      /**
-       * Return true if the hardware natively supports some format with
-       * compatible bitfield layout, but possibly different data types.
-       */
-      inline bool
-      has_supported_bit_layout(const gen_device_info *devinfo,
-                               isl_format format)
-      {
-         const color_u widths = get_bit_widths(format);
-         const color_u lower_widths = get_bit_widths(
-            isl_lower_storage_image_format(devinfo, format));
-
-         return (widths.r == lower_widths.r &&
-                 widths.g == lower_widths.g &&
-                 widths.b == lower_widths.b &&
-                 widths.a == lower_widths.a);
-      }
-
-      /**
-       * Return true if we are required to spread individual components over
-       * several components of the format used by the hardware (RG32 and
-       * friends implemented as RGBA16UI).
-       */
-      inline bool
-      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
-      {
-         const isl_format lower_format =
-            isl_lower_storage_image_format(devinfo, format);
-
-         return (isl_format_get_num_channels(format) <
-                 isl_format_get_num_channels(lower_format));
-      }
-
-      /**
-       * Return true if the hardware returns garbage in the unused high bits
-       * of each component.  This may happen on IVB because we rely on the
-       * undocumented behavior that typed reads from surfaces of the
-       * unsupported R8 and R16 formats return useful data in their least
-       * significant bits.
-       */
-      inline bool
-      has_undefined_high_bits(const gen_device_info *devinfo,
-                              isl_format format)
-      {
-         const isl_format lower_format =
-            isl_lower_storage_image_format(devinfo, format);
-
-         return (devinfo->gen == 7 && !devinfo->is_haswell &&
-                 (lower_format == ISL_FORMAT_R16_UINT ||
-                  lower_format == ISL_FORMAT_R8_UINT));
-      }
-
-      /**
-       * Return true if the format represents values as signed integers
-       * requiring sign extension when unpacking.
-       */
-      inline bool
-      needs_sign_extension(isl_format format)
-      {
-         return isl_format_has_snorm_channel(format) ||
-                isl_format_has_sint_channel(format);
-      }
-   }
-
-   namespace image_validity {
-      /**
-       * Check whether the bound image is suitable for untyped access.
-       */
-      static brw_predicate
-      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
-                               brw_predicate pred)
-      {
-         const gen_device_info *devinfo = bld.shader->devinfo;
-         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
-
-         if (devinfo->gen == 7 && !devinfo->is_haswell) {
-            /* Check whether the first stride component (i.e. the Bpp value)
-             * is greater than four, which on Gen7 indicates that a surface of
-             * type RAW has been bound for untyped access.  Reading or writing
-             * to a surface of type other than RAW using untyped surface
-             * messages causes a hang on IVB and VLV.
-             */
-            set_predicate(pred,
-                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
-                                  BRW_CONDITIONAL_G));
-
-            return BRW_PREDICATE_NORMAL;
-         } else {
-            /* More recent generations handle the format mismatch
-             * gracefully.
-             */
-            return pred;
-         }
-      }
-
-      /**
-       * Check whether there is an image bound at the given index and write
-       * the comparison result to f0.0.  Returns an appropriate predication
-       * mode to use on subsequent image operations.
-       */
-      static brw_predicate
-      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
-      {
-         const gen_device_info *devinfo = bld.shader->devinfo;
-         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
-         if (devinfo->gen == 7 && !devinfo->is_haswell) {
-            /* Check the first component of the size field to find out if the
-             * image is bound.  Necessary on IVB for typed atomics because
-             * they don't seem to respect null surfaces and will happily
-             * corrupt or read random memory when no image is bound.
-             */
-            bld.CMP(bld.null_reg_ud(),
-                    retype(size, BRW_REGISTER_TYPE_UD),
-                    brw_imm_d(0), BRW_CONDITIONAL_NZ);
-
-            return BRW_PREDICATE_NORMAL;
-         } else {
-            /* More recent platforms implement compliant behavior when a null
-             * surface is bound.
-             */
-            return BRW_PREDICATE_NONE;
-         }
-      }
-
-      /**
-       * Check whether the provided coordinates are within the image bounds
-       * and write the comparison result to f0.0.  Returns an appropriate
-       * predication mode to use on subsequent image operations.
-       */
-      static brw_predicate
-      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
-                        const fs_reg &addr, unsigned dims)
-      {
-         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
-         for (unsigned c = 0; c < dims; ++c)
-            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
-                          bld.CMP(bld.null_reg_ud(),
-                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
-                                  offset(size, bld, c),
-                                  BRW_CONDITIONAL_L));
-
-         return BRW_PREDICATE_NORMAL;
-      }
-   }
-
-   namespace image_coordinates {
-      /**
-       * Return the total number of coordinates needed to address a texel of
-       * the surface, which may be more than the sum of \p surf_dims and \p
-       * arr_dims if padding is required.
-       */
-      static unsigned
-      num_image_coordinates(const fs_builder &bld,
-                            unsigned surf_dims, unsigned arr_dims,
-                            isl_format format)
-      {
-         /* HSW in vec4 mode and our software coordinate handling for untyped
-          * reads want the array index to be at the Z component.
-          */
-         const bool array_index_at_z =
-            format != ISL_FORMAT_UNSUPPORTED &&
-            !isl_has_matching_typed_storage_image_format(
-               bld.shader->devinfo, format);
-         const unsigned zero_dims =
-            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
-
-         return surf_dims + zero_dims + arr_dims;
-      }
-
-      /**
-       * Transform image coordinates into the form expected by the
-       * implementation.
-       */
-      static fs_reg
-      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
-                             unsigned surf_dims, unsigned arr_dims,
-                             isl_format format)
-      {
-         const unsigned dims =
-            num_image_coordinates(bld, surf_dims, arr_dims, format);
-
-         if (dims > surf_dims + arr_dims) {
-            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
-            /* The array index is required to be passed in as the Z component,
-             * insert a zero at the Y component to shift it to the right
-             * position.
-             *
-             * FINISHME: Factor out this frequently recurring pattern into a
-             * helper function.
-             */
-            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
-            const fs_reg dst = bld.vgrf(addr.type, dims);
-            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
-            return dst;
-         } else {
-            return addr;
-         }
-      }
-
-      /**
-       * Calculate the offset in memory of the texel given by \p coord.
-       *
-       * This is meant to be used with untyped surface messages to access a
-       * tiled surface, which involves taking into account the tiling and
-       * swizzling modes of the surface manually so it will hopefully not
-       * happen very often.
-       *
-       * The tiling algorithm implemented here matches either the X or Y
-       * tiling layouts supported by the hardware depending on the tiling
-       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
-       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
-       * explanation of the hardware tiling format.
-       */
-      static fs_reg
-      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
-                               const fs_reg &coord, unsigned dims)
-      {
-         const gen_device_info *devinfo = bld.shader->devinfo;
-         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
-         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
-         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
-         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
-         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
-
-         /* Shift the coordinates by the fixed surface offset.  It may be
-          * non-zero if the image is a single slice of a higher-dimensional
-          * surface, or if a non-zero mipmap level of the surface is bound to
-          * the pipeline.  The offset needs to be applied here rather than at
-          * surface state set-up time because the desired slice-level may
-          * start mid-tile, so simply shifting the surface base address
-          * wouldn't give a well-formed tiled surface in the general case.
-          */
-         for (unsigned c = 0; c < 2; ++c)
-            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
-                    (c < dims ?
-                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
-                     fs_reg(brw_imm_d(0))));
-
-         /* The layout of 3-D textures in memory is sort-of like a tiling
-          * format.  At each miplevel, the slices are arranged in rows of
-          * 2^level slices per row.  The slice row is stored in tmp.y and
-          * the slice within the row is stored in tmp.x.
-          *
-          * The layout of 2-D array textures and cubemaps is much simpler:
-          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
-          * stored in memory as an array of slices, each one being a 2-D
-          * arrangement of miplevels, or as a 2D arrangement of miplevels,
-          * each one being an array of slices.  In either case the separation
-          * between slices of the same LOD is equal to the qpitch value
-          * provided as stride.w.
-          *
-          * This code can be made to handle either 2D arrays or 3D textures
-          * by passing in the miplevel as tile.z for 3-D textures and 0 in
-          * tile.z for 2-D array textures.
-          *
-          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
-          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
-          * of the hardware 3D texture and 2D array layouts.
-          */
-         if (dims > 2) {
-            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
-             * index.
-             */
-            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
-                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
-            bld.SHR(offset(tmp, bld, 1),
-                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
-                    offset(tile, bld, 2));
-
-            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
-             * slice offset.
-             */
-            for (unsigned c = 0; c < 2; ++c) {
-               bld.MUL(offset(tmp, bld, c),
-                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
-               bld.ADD(offset(addr, bld, c),
-                       offset(addr, bld, c), offset(tmp, bld, c));
-            }
-         }
-
-         if (dims > 1) {
-            /* Calculate the major/minor x and y indices.  In order to
-             * accommodate both X and Y tiling, the Y-major tiling format is
-             * treated as being a bunch of narrow X-tiles placed next to each
-             * other.  This means that the tile width for Y-tiling is actually
-             * the width of one sub-column of the Y-major tile where each 4K
-             * tile has 8 512B sub-columns.
-             *
-             * The major Y value is the row of tiles in which the pixel lives.
-             * The major X value is the tile sub-column in which the pixel
-             * lives; for X tiling, this is the same as the tile column, for Y
-             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
-             * are the position within the sub-column.
-             */
-            for (unsigned c = 0; c < 2; ++c) {
-               /* Calculate the minor x and y indices. */
-               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
-                       brw_imm_d(0), offset(addr, bld, c));
-
-               /* Calculate the major x and y indices. */
-               bld.SHR(offset(major, bld, c),
-                       offset(addr, bld, c), offset(tile, bld, c));
-            }
-
-            /* Calculate the texel index from the start of the tile row and
-             * the vertical coordinate of the row.
-             * Equivalent to:
-             *   tmp.x = (major.x << tile.y << tile.x) +
-             *           (minor.y << tile.x) + minor.x
-             *   tmp.y = major.y << tile.y
-             */
-            bld.SHL(tmp, major, offset(tile, bld, 1));
-            bld.ADD(tmp, tmp, offset(minor, bld, 1));
-            bld.SHL(tmp, tmp, offset(tile, bld, 0));
-            bld.ADD(tmp, tmp, minor);
-            bld.SHL(offset(tmp, bld, 1),
-                    offset(major, bld, 1), offset(tile, bld, 1));
-
-            /* Add it to the start of the tile row. */
-            bld.MUL(offset(tmp, bld, 1),
-                    offset(tmp, bld, 1), offset(stride, bld, 1));
-            bld.ADD(tmp, tmp, offset(tmp, bld, 1));
-
-            /* Multiply by the Bpp value. */
-            bld.MUL(dst, tmp, stride);
-
-            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
-               /* Take into account the two dynamically specified shifts.
-                * Both are needed to implement swizzling of X-tiled
-                * surfaces.  For Y-tiled surfaces only one bit needs to be
-                * XOR-ed with bit 6 of the memory address, so a swz value of
-                * 0xff (actually interpreted as 31 by the hardware) will be
-                * provided to cause the relevant bit of tmp.y to be zero and
-                * turn the first XOR into the identity.  For linear surfaces
-                * or platforms lacking address swizzling both shifts will be
-                * 0xff causing the relevant bits of both tmp.x and .y to be
-                * zero, which effectively disables swizzling.
-                */
-               for (unsigned c = 0; c < 2; ++c)
-                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));
-
-               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
-               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
-               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
-               bld.XOR(dst, dst, tmp);
-            }
-
-         } else {
-            /* Multiply by the Bpp/stride value.  Note that the addr.y may be
-             * non-zero even if the image is one-dimensional because a
-             * vertical offset may have been applied above to select a
-             * non-zero slice or level of a higher-dimensional texture.
-             */
-            bld.MUL(offset(addr, bld, 1),
-                    offset(addr, bld, 1), offset(stride, bld, 1));
-            bld.ADD(addr, addr, offset(addr, bld, 1));
-            bld.MUL(dst, addr, stride);
-         }
-
-         return dst;
-      }
-   }
-
-   namespace image_format_conversion {
-      using image_format_info::color_u;
-
-      namespace {
-         /**
-          * Maximum representable value in an unsigned integer with the given
-          * number of bits.
-          */
-         inline unsigned
-         scale(unsigned n)
-         {
-            return (1 << n) - 1;
-         }
-      }
-
-      /**
-       * Pack the vector \p src in a bitfield given the per-component bit
-       * shifts and widths.  Note that bitfield components are not allowed to
-       * cross 32-bit boundaries.
-       */
-      static fs_reg
-      emit_pack(const fs_builder &bld, const fs_reg &src,
-                const color_u &shifts, const color_u &widths)
-      {
-         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
-         bool seen[4] = {};
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
-
-               /* Shift each component left to the correct bitfield position. */
-               bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));
-
-               /* Add everything up. */
-               if (seen[shifts[c] / 32]) {
-                  bld.OR(offset(dst, bld, shifts[c] / 32),
-                         offset(dst, bld, shifts[c] / 32), tmp);
-               } else {
-                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
-                  seen[shifts[c] / 32] = true;
-               }
-            }
-         }
-
-         return dst;
-      }
-
-      /**
-       * Unpack a vector from the bitfield \p src given the per-component bit
-       * shifts and widths.  Note that bitfield components are not allowed to
-       * cross 32-bit boundaries.
-       */
-      static fs_reg
-      emit_unpack(const fs_builder &bld, const fs_reg &src,
-                  const color_u &shifts, const color_u &widths)
-      {
-         const fs_reg dst = bld.vgrf(src.type, 4);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               /* Shift left to discard the most significant bits. */
-               bld.SHL(offset(dst, bld, c),
-                       offset(src, bld, shifts[c] / 32),
-                       brw_imm_ud(32 - shifts[c] % 32 - widths[c]));
-
-               /* Shift back to the least significant bits using an arithmetic
-                * shift to get sign extension on signed types.
-                */
-               bld.ASR(offset(dst, bld, c),
-                       offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
-            }
-         }
-
-         return dst;
-      }
-
-      /**
-       * Convert an integer vector into another integer vector of the
-       * specified bit widths, properly handling overflow.
-       */
-      static fs_reg
-      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
-                              const color_u &widths, bool is_signed)
-      {
-         const unsigned s = (is_signed ? 1 : 0);
-         const fs_reg dst = bld.vgrf(
-            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
-         assert(src.type == dst.type);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               /* Clamp to the maximum value. */
-               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
-                               brw_imm_d((int)scale(widths[c] - s)),
-                               BRW_CONDITIONAL_L);
-
-               /* Clamp to the minimum value. */
-               if (is_signed)
-                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
-                                  brw_imm_d(-(int)scale(widths[c] - s) - 1),
-                                  BRW_CONDITIONAL_GE);
-
-               /* Mask off all but the bits we actually want.  Otherwise, if
-                * we pass a negative number into the hardware when it's
-                * expecting something like UINT8, it will happily clamp it to
-                * +255 for us.
-                */
-               if (is_signed && widths[c] < 32)
-                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
-                          brw_imm_d(scale(widths[c])));
-            }
-         }
-
-         return dst;
-      }
-
-      /**
-       * Convert a normalized fixed-point vector of the specified signedness
-       * and bit widths into a floating point vector.
-       */
-      static fs_reg
-      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
-                               const color_u &widths, bool is_signed)
-      {
-         const unsigned s = (is_signed ? 1 : 0);
-         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               /* Convert to float. */
-               bld.MOV(offset(dst, bld, c), offset(src, bld, c));
-
-               /* Divide by the normalization constants. */
-               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
-                       brw_imm_f(1.0f / scale(widths[c] - s)));
-
-               /* Clamp to the minimum value. */
-               if (is_signed)
-                  bld.emit_minmax(offset(dst, bld, c),
-                                  offset(dst, bld, c), brw_imm_f(-1.0f),
-                                  BRW_CONDITIONAL_GE);
-            }
-         }
-         return dst;
-      }
-
-      /**
-       * Convert a floating-point vector into a normalized fixed-point vector
-       * of the specified signedness and bit widths.
-       */
-      static fs_reg
-      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
-                             const color_u &widths, bool is_signed)
-      {
-         const unsigned s = (is_signed ? 1 : 0);
-         const fs_reg dst = bld.vgrf(
-            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
-         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               /* Clamp the normalized floating-point argument. */
-               if (is_signed) {
-                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
-                                  brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);
-
-                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
-                                  brw_imm_f(1.0f), BRW_CONDITIONAL_L);
-               } else {
-                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
-                                             offset(src, bld, c)));
-               }
-
-               /* Multiply by the normalization constants. */
-               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
-                       brw_imm_f((float)scale(widths[c] - s)));
-
-               /* Convert to integer. */
-               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
-               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));
-
-               /* Mask off all but the bits we actually want.  Otherwise, if
-                * we pass a negative number into the hardware when it's
-                * expecting something like UINT8, it will happily clamp it to
-                * +255 for us.
-                */
-               if (is_signed && widths[c] < 32)
-                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
-                          brw_imm_d(scale(widths[c])));
-            }
-         }
-
-         return dst;
-      }
-
-      /**
-       * Convert a floating point vector of the specified bit widths into a
-       * 32-bit floating point vector.
-       */
-      static fs_reg
-      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
-                              const color_u &widths)
-      {
-         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
-         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               bld.MOV(offset(dst, bld, c), offset(src, bld, c));
-
-               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
-                * This works because they have a 5-bit exponent just like the
-                * 16-bit floating point format, and they have no sign bit.
-                */
-               if (widths[c] < 16)
-                  bld.SHL(offset(dst, bld, c),
-                          offset(dst, bld, c), brw_imm_ud(15 - widths[c]));
-
-               /* Convert to 32-bit floating point. */
-               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
-            }
-         }
-
-         return fdst;
-      }
-
-      /**
-       * Convert a vector into a floating point vector of the specified bit
-       * widths.
-       */
-      static fs_reg
-      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
-                            const color_u &widths)
-      {
-         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
-         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
-
-         for (unsigned c = 0; c < 4; ++c) {
-            if (widths[c]) {
-               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));
-
-               /* Clamp to the minimum value. */
-               if (widths[c] < 16)
-                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
-                                  brw_imm_f(0.0f), BRW_CONDITIONAL_GE);
-
-               /* Convert to 16-bit floating-point. */
-               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));
-
-               /* Discard the least significant bits to get floating point
-                * numbers of the requested width.  This works because the
-                * 10-bit and 11-bit floating point formats have a 5-bit
-                * exponent just like the 16-bit format, and they have no sign
-                * bit.
-                */
-               if (widths[c] < 16)
-                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
-                          brw_imm_ud(15 - widths[c]));
-            }
-         }
-
-         return dst;
-      }
-
-      /**
-       * Fill missing components of a vector with 0, 0, 0, 1.
-       */
-      static fs_reg
-      emit_pad(const fs_builder &bld, const fs_reg &src,
-               const color_u &widths)
-      {
-         const fs_reg dst = bld.vgrf(src.type, 4);
-         const unsigned pad[] = { 0, 0, 0, 1 };
-
-         for (unsigned c = 0; c < 4; ++c)
-            bld.MOV(offset(dst, bld, c),
-                    widths[c] ? offset(src, bld, c)
-                              : fs_reg(brw_imm_ud(pad[c])));
-
-         return dst;
-      }
-   }
-}
-
-namespace brw {
-   namespace image_access {
-      /**
-       * Load a vector from a surface of the given format and dimensionality
-       * at the given coordinates.  \p surf_dims and \p arr_dims give the
-       * number of non-array and array coordinates of the image respectively.
-       */
-      fs_reg
-      emit_image_load(const fs_builder &bld,
-                      const fs_reg &image, const fs_reg &addr,
-                      unsigned surf_dims, unsigned arr_dims,
-                      unsigned gl_format)
-      {
-         using namespace image_format_info;
-         using namespace image_format_conversion;
-         using namespace image_validity;
-         using namespace image_coordinates;
-         using namespace surface_access;
-         const gen_device_info *devinfo = bld.shader->devinfo;
-         const isl_format format = isl_format_for_gl_format(gl_format);
-         const isl_format lower_format =
-            isl_lower_storage_image_format(devinfo, format);
-         fs_reg tmp;
-
-         /* Transform the image coordinates into actual surface coordinates. */
-         const fs_reg saddr =
-            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
-         const unsigned dims =
-            num_image_coordinates(bld, surf_dims, arr_dims, format);
-
-         if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
-            /* Hopefully we get here most of the time... */
-            tmp = emit_typed_read(bld, image, saddr, dims,
-                                  isl_format_get_num_channels(lower_format));
-         } else {
-            /* Untyped surface reads return 32 bits of the surface per
-             * component, without any sort of unpacking or type conversion,
-             */
-            const unsigned size = isl_format_get_layout(format)->bpb / 32;
-            /* they don't properly handle out of bounds access, so we have to
-             * check manually if the coordinates are valid and predicate the
-             * surface read on the result,
-             */
-            const brw_predicate pred =
-               emit_untyped_image_check(bld, image,
-                                        emit_bounds_check(bld, image,
-                                                          saddr, dims));
-
-            /* and they don't know about surface coordinates, we need to
-             * convert them to a raw memory offset.
-             */
-            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);
-
-            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);
-
-            /* An out of bounds surface access should give zero as result. */
-            for (unsigned c = 0; c < size; ++c)
-               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
-                                           offset(tmp, bld, c), brw_imm_d(0)));
-         }
-
-         /* Set the register type to D instead of UD if the data type is
-          * represented as a signed integer in memory so that sign extension
-          * is handled correctly by unpack.
-          */
-         if (needs_sign_extension(format))
-            tmp = retype(tmp, BRW_REGISTER_TYPE_D);
-
-         if (!has_supported_bit_layout(devinfo, format)) {
-            /* Unpack individual vector components from the bitfield if the
-             * hardware is unable to do it for us.
-             */
-            if (has_split_bit_layout(devinfo, format))
-               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
-                               get_bit_widths(lower_format));
-            else
-               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
-                                 get_bit_widths(format));
-
-         } else if ((needs_sign_extension(format) &&
-                     !is_conversion_trivial(devinfo, format)) ||
-                    has_undefined_high_bits(devinfo, format)) {
-            /* Perform a trivial unpack even though the bit layout matches in
-             * order to get the most significant bits of each component
-             * initialized properly.
-             */
-            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
-                              get_bit_widths(format));
-         }
-
-         if (!isl_format_has_int_channel(format)) {
-            if (is_conversion_trivial(devinfo, format)) {
-               /* Just need to cast the vector to the target type. */
-               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
-            } else {
-               /* Do the right sort of type conversion to float. */
-               if (isl_format_has_float_channel(format))
-                  tmp = emit_convert_from_float(
-                     bld, tmp, get_bit_widths(format));
-               else
-                  tmp = emit_convert_from_scaled(
-                     bld, tmp, get_bit_widths(format),
-                     isl_format_has_snorm_channel(format));
-            }
-         }
-
-         /* Initialize missing components of the result. */
-         return emit_pad(bld, tmp, get_bit_widths(format));
-      }
-
-      /**
-       * Store a vector in a surface of the given format and dimensionality at
-       * the given coordinates.  \p surf_dims and \p arr_dims give the number
-       * of non-array and array coordinates of the image respectively.
-       */
-      void
-      emit_image_store(const fs_builder &bld, const fs_reg &image,
-                       const fs_reg &addr, const fs_reg &src,
-                       unsigned surf_dims, unsigned arr_dims,
-                       unsigned gl_format)
-      {
-         using namespace image_format_info;
-         using namespace image_format_conversion;
-         using namespace image_validity;
-         using namespace image_coordinates;
-         using namespace surface_access;
-         const isl_format format = isl_format_for_gl_format(gl_format);
-         const gen_device_info *devinfo = bld.shader->devinfo;
-
-         /* Transform the image coordinates into actual surface coordinates. */
-         const fs_reg saddr =
-            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
-         const unsigned dims =
-            num_image_coordinates(bld, surf_dims, arr_dims, format);
-
-         if (gl_format == GL_NONE) {
-            /* We don't know what the format is, but that's fine because it
-             * implies write-only access, and typed surface writes are always
-             * able to take care of type conversion and packing for us.
-             */
-            emit_typed_write(bld, image, saddr, src, dims, 4);
-
-         } else {
-            const isl_format lower_format =
-               isl_lower_storage_image_format(devinfo, format);
-            fs_reg tmp = src;
-
-            if (!is_conversion_trivial(devinfo, format)) {
-               /* Do the right sort of type conversion. */
-               if (isl_format_has_float_channel(format))
-                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));
-
-               else if (isl_format_has_int_channel(format))
-                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
-                                                isl_format_has_sint_channel(format));
-
-               else
-                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
-                                               isl_format_has_snorm_channel(format));
-            }
-
-            /* We're down to bit manipulation at this point. */
-            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);
-
-            if (!has_supported_bit_layout(devinfo, format)) {
-               /* Pack the vector components into a bitfield if the hardware
-                * is unable to do it for us.
-                */
-               if (has_split_bit_layout(devinfo, format))
-                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
-                                    get_bit_widths(lower_format));
-
-               else
-                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
-                                  get_bit_widths(format));
-            }
-
-            if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
-               /* Hopefully we get here most of the time... */
-               emit_typed_write(bld, image, saddr, tmp, dims,
-                                isl_format_get_num_channels(lower_format));
-
-            } else {
-               /* Untyped surface writes store 32 bits of the surface per
-                * component, without any sort of packing or type conversion,
-                */
-               const unsigned size = isl_format_get_layout(format)->bpb / 32;
-
-               /* they don't properly handle out of bounds access, so we have
-                * to check manually if the coordinates are valid and predicate
-                * the surface write on the result,
-                */
-               const brw_predicate pred =
-                  emit_untyped_image_check(bld, image,
-                                           emit_bounds_check(bld, image,
-                                                             saddr, dims));
-
-               /* and, phew, they don't know about surface coordinates, we
-                * need to convert them to a raw memory offset.
-                */
-               const fs_reg laddr = emit_address_calculation(
-                  bld, image, saddr, dims);
-
-               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
-            }
-         }
-      }
-
-      /**
-       * Perform an atomic read-modify-write operation in a surface of the
-       * given dimensionality at the given coordinates.  \p surf_dims and \p
-       * arr_dims give the number of non-array and array coordinates of the
-       * image respectively.  Main building block of the imageAtomic GLSL
-       * built-ins.
-       */
-      fs_reg
-      emit_image_atomic(const fs_builder &bld,
-                        const fs_reg &image, const fs_reg &addr,
-                        const fs_reg &src0, const fs_reg &src1,
-                        unsigned surf_dims, unsigned arr_dims,
-                        unsigned rsize, unsigned op)
-      {
-         using namespace image_validity;
-         using namespace image_coordinates;
-         using namespace surface_access;
-         /* Avoid performing an atomic operation on an unbound surface. */
-         const brw_predicate pred = emit_typed_atomic_check(bld, image);
-
-         /* Transform the image coordinates into actual surface coordinates. */
-         const fs_reg saddr =
-            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
-                                   ISL_FORMAT_R32_UINT);
-         const unsigned dims =
-            num_image_coordinates(bld, surf_dims, arr_dims,
-                                  ISL_FORMAT_R32_UINT);
-
-         /* Thankfully we can do without untyped atomics here. */
-         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
-                                              dims, rsize, op, pred);
-
-         /* An unbound surface access should give zero as result. */
-         if (rsize && pred)
-            set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));
-
-         return retype(tmp, src0.type);
-      }
-   }
-}
diff --git a/src/intel/compiler/brw_fs_surface_builder.h b/src/intel/compiler/brw_fs_surface_builder.h

index 6952df64286f8d4a77c21c5fb54a2e3b8f6d6381..8df5d25f4fab0f64af34da55065ee23190a576bd 100644 (file)
--- a/src/intel/compiler/brw_fs_surface_builder.h
+++ b/src/intel/compiler/brw_fs_surface_builder.h
@@ -85,25 +85,5 @@ namespace brw {
                                  unsigned bit_size,
                                  brw_predicate pred = BRW_PREDICATE_NONE);
     }
-
-   namespace image_access {
-      fs_reg
-      emit_image_load(const fs_builder &bld,
-                      const fs_reg &image, const fs_reg &addr,
-                      unsigned surf_dims, unsigned arr_dims,
-                      unsigned gl_format);
-
-      void
-      emit_image_store(const fs_builder &bld, const fs_reg &image,
-                       const fs_reg &addr, const fs_reg &src,
-                       unsigned surf_dims, unsigned arr_dims,
-                       unsigned gl_format);
-      fs_reg
-      emit_image_atomic(const fs_builder &bld,
-                        const fs_reg &image, const fs_reg &addr,
-                        const fs_reg &src0, const fs_reg &src1,
-                        unsigned surf_dims, unsigned arr_dims,
-                        unsigned rsize, unsigned op);
-   }
  }
  #endif
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h

index 5c75ef2324a2425aefed5d3bc2806b50f1cc0039..72a6ee8884abecae7dc770f79de89f4f3c2d6e3b 100644 (file)
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -114,6 +114,9 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue,
                                 GLenum tes_primitive_mode);
  void brw_nir_lower_fs_outputs(nir_shader *nir);
  
+bool brw_nir_lower_image_load_store(nir_shader *nir,
+                                    const struct gen_device_info *devinfo);
+
  nir_shader *brw_postprocess_nir(nir_shader *nir,
                                  const struct brw_compiler *compiler,
                                  bool is_scalar);
diff --git a/src/intel/compiler/brw_nir_lower_image_load_store.c b/src/intel/compiler/brw_nir_lower_image_load_store.c

new file mode 100644 (file)

index 0000000..b931a6d
--- /dev/null
+++ b/src/intel/compiler/brw_nir_lower_image_load_store.c
@@ -0,0 +1,822 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "isl/isl.h"
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
+
+/* Translate a GL image format enum into the matching isl_format.  Higher
+ * compiler layers use GL enums even for SPIR-V/Vulkan shaders.  GL_NONE
+ * and unknown enums yield ISL_FORMAT_UNSUPPORTED (unknown ones assert).
+ */
+static enum isl_format
+isl_format_for_gl_format(uint32_t gl_format)
+{
+   switch (gl_format) {
+   case GL_R8:             return ISL_FORMAT_R8_UNORM;
+   case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
+   case GL_R8UI:           return ISL_FORMAT_R8_UINT;
+   case GL_R8I:            return ISL_FORMAT_R8_SINT;
+   case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
+   case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
+   case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
+   case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
+   case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
+   case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
+   case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
+   case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
+   case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
+   case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
+   case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
+   case GL_R16:            return ISL_FORMAT_R16_UNORM;
+   case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
+   case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
+   case GL_R16UI:          return ISL_FORMAT_R16_UINT;
+   case GL_R16I:           return ISL_FORMAT_R16_SINT;
+   case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
+   case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
+   case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
+   case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
+   case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
+   case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
+   case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
+   case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
+   case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
+   case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
+   case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
+   case GL_R32UI:          return ISL_FORMAT_R32_UINT;
+   case GL_R32I:           return ISL_FORMAT_R32_SINT;
+   case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
+   case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
+   case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
+   case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
+   case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
+   case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
+   case GL_NONE:           return ISL_FORMAT_UNSUPPORTED; /* format unknown */
+   default: /* not one of the image formats listed above */
+      assert(!"Invalid image format");
+      return ISL_FORMAT_UNSUPPORTED; /* only if asserts are disabled */
+   }
+}
+
+static nir_ssa_def * /* emit a load of one brw_image_param field */
+_load_image_param(nir_builder *b, nir_deref_instr *deref, unsigned offset)
+{
+   nir_intrinsic_instr *load =
+      nir_intrinsic_instr_create(b->shader,
+                                 nir_intrinsic_image_deref_load_param_intel);
+   load->src[0] = nir_src_for_ssa(&deref->dest.ssa); /* image deref */
+   nir_intrinsic_set_base(load, offset / 4); /* param index; /4: dwords? */
+
+   switch (offset) { /* vector width of each param */
+   case BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET:
+      load->num_components = 1;
+      break;
+   case BRW_IMAGE_PARAM_OFFSET_OFFSET:
+   case BRW_IMAGE_PARAM_SWIZZLING_OFFSET:
+      load->num_components = 2;
+      break;
+   case BRW_IMAGE_PARAM_TILING_OFFSET:
+   case BRW_IMAGE_PARAM_SIZE_OFFSET:
+      load->num_components = 3;
+      break;
+   case BRW_IMAGE_PARAM_STRIDE_OFFSET:
+      load->num_components = 4;
+      break;
+   default:
+      unreachable("Invalid param offset");
+   }
+   nir_ssa_dest_init(&load->instr, &load->dest,
+                     load->num_components, 32, NULL); /* 32-bit channels */
+
+   nir_builder_instr_insert(b, &load->instr); /* at the builder's cursor */
+   return &load->dest.ssa;
+}
+
+#define load_image_param(b, d, o) \
+   _load_image_param(b, d, BRW_IMAGE_PARAM_##o##_OFFSET) /* o: e.g. SIZE */
+
+static nir_ssa_def * /* trim coord to the image's dims; widen 1D arrays */
+sanitize_image_coord(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *coord)
+{
+   if (glsl_get_sampler_dim(deref->type) == GLSL_SAMPLER_DIM_1D &&
+       glsl_sampler_type_is_array(deref->type)) {
+      /* Treat 1D arrays like 2D arrays: (c0, c1) becomes (c0, 0, c1). */
+      return nir_vec3(b, nir_channel(b, coord, 0),
+                         nir_imm_int(b, 0),
+                         nir_channel(b, coord, 1));
+   } else {
+      unsigned dims = glsl_get_sampler_coordinate_components(deref->type);
+      return nir_channels(b, coord, (1 << dims) - 1); /* keep first dims */
+   }
+}
+
+static nir_ssa_def * /* AND over channels of signed (coord < size) */
+image_coord_is_in_bounds(nir_builder *b, nir_deref_instr *deref,
+                         nir_ssa_def *coord)
+{
+   coord = sanitize_image_coord(b, deref, coord);
+   nir_ssa_def *size = load_image_param(b, deref, SIZE); /* 3 channels */
+
+   nir_ssa_def *cmp = nir_ilt(b, coord, size);
+   nir_ssa_def *in_bounds = nir_imm_int(b, NIR_TRUE); /* AND identity */
+   for (unsigned i = 0; i < coord->num_components; i++)
+      in_bounds = nir_iand(b, in_bounds, nir_channel(b, cmp, i));
+
+   return in_bounds;
+}
+
+/** Calculate the offset in memory of the texel given by \p coord.
+ *
+ * This is meant to be used with untyped surface messages to access a tiled
+ * surface, which involves taking into account the tiling and swizzling modes
+ * of the surface manually so it will hopefully not happen very often.
+ *
+ * The tiling algorithm implemented here matches either the X or Y tiling
+ * layouts supported by the hardware depending on the tiling coefficients
+ * passed to the program as uniforms.  See Volume 1 Part 2 Section 4.5
+ * "Address Tiling Function" of the IVB PRM for an in-depth explanation of
+ * the hardware tiling format.
+ */
+static nir_ssa_def *
+image_address(nir_builder *b, const struct gen_device_info *devinfo,
+              nir_deref_instr *deref, nir_ssa_def *coord)
+{
+   coord = sanitize_image_coord(b, deref, coord);
+
+   nir_ssa_def *offset = load_image_param(b, deref, OFFSET);
+   nir_ssa_def *tiling = load_image_param(b, deref, TILING);
+   nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
+
+   /* Shift the coordinates by the fixed surface offset.  It may be non-zero
+    * if the image is a single slice of a higher-dimensional surface, or if a
+    * non-zero mipmap level of the surface is bound to the pipeline.  The
+    * offset needs to be applied here rather than at surface state set-up time
+    * because the desired slice-level may start mid-tile, so simply shifting
+    * the surface base address wouldn't give a well-formed tiled surface in
+    * the general case.
+    */
+   nir_ssa_def *xypos = (coord->num_components == 1) ?
+                        nir_vec2(b, coord, nir_imm_int(b, 0)) :
+                        nir_channels(b, coord, 0x3);
+   xypos = nir_iadd(b, xypos, offset);
+
+   /* The layout of 3-D textures in memory is sort-of like a tiling
+    * format.  At each miplevel, the slices are arranged in rows of
+    * 2^level slices per row.  The slice row is stored in tmp.y and
+    * the slice within the row is stored in tmp.x.
+    *
+    * The layout of 2-D array textures and cubemaps is much simpler:
+    * Depending on whether the ARYSPC_LOD0 layout is in use it will be
+    * stored in memory as an array of slices, each one being a 2-D
+    * arrangement of miplevels, or as a 2D arrangement of miplevels,
+    * each one being an array of slices.  In either case the separation
+    * between slices of the same LOD is equal to the qpitch value
+    * provided as stride.w.
+    *
+    * This code can be made to handle either 2D arrays and 3D textures
+    * by passing in the miplevel as tile.z for 3-D textures and 0 in
+    * tile.z for 2-D array textures.
+    *
+    * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
+    * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
+    * of the hardware 3D texture and 2D array layouts.
+    */
+   if (coord->num_components > 2) {
+      /* Decompose z into a major (tmp.y) and a minor (tmp.x)
+       * index.
+       */
+      nir_ssa_def *z = nir_channel(b, coord, 2);
+      nir_ssa_def *z_x = nir_ubfe(b, z, nir_imm_int(b, 0),
+                                  nir_channel(b, tiling, 2));
+      nir_ssa_def *z_y = nir_ushr(b, z, nir_channel(b, tiling, 2));
+
+      /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
+       * slice offset.
+       */
+      xypos = nir_iadd(b, xypos, nir_imul(b, nir_vec2(b, z_x, z_y),
+                                             nir_channels(b, stride, 0xc)));
+   }
+
+   nir_ssa_def *addr;
+   if (coord->num_components > 1) {
+      /* Calculate the major/minor x and y indices.  In order to
+       * accommodate both X and Y tiling, the Y-major tiling format is
+       * treated as being a bunch of narrow X-tiles placed next to each
+       * other.  This means that the tile width for Y-tiling is actually
+       * the width of one sub-column of the Y-major tile where each 4K
+       * tile has 8 512B sub-columns.
+       *
+       * The major Y value is the row of tiles in which the pixel lives.
+       * The major X value is the tile sub-column in which the pixel
+       * lives; for X tiling, this is the same as the tile column, for Y
+       * tiling, each tile has 8 sub-columns.  The minor X and Y indices
+       * are the position within the sub-column.
+       */
+
+      /* Calculate the minor x and y indices. */
+      nir_ssa_def *minor = nir_ubfe(b, xypos, nir_imm_int(b, 0),
+                                       nir_channels(b, tiling, 0x3));
+      nir_ssa_def *major = nir_ushr(b, xypos, nir_channels(b, tiling, 0x3));
+
+      /* Calculate the texel index from the start of the tile row and the
+       * vertical coordinate of the row.
+       * Equivalent to:
+       *   tmp.x = (major.x << tile.y << tile.x) +
+       *           (minor.y << tile.x) + minor.x
+       *   tmp.y = major.y << tile.y
+       */
+      nir_ssa_def *idx_x, *idx_y;
+      idx_x = nir_ishl(b, nir_channel(b, major, 0), nir_channel(b, tiling, 1));
+      idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 1));
+      idx_x = nir_ishl(b, idx_x, nir_channel(b, tiling, 0));
+      idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 0));
+      idx_y = nir_ishl(b, nir_channel(b, major, 1), nir_channel(b, tiling, 1));
+
+      /* Add it to the start of the tile row. */
+      nir_ssa_def *idx;
+      idx = nir_imul(b, idx_y, nir_channel(b, stride, 1));
+      idx = nir_iadd(b, idx, idx_x);
+
+      /* Multiply by the Bpp value. */
+      addr = nir_imul(b, idx, nir_channel(b, stride, 0));
+
+      if (devinfo->gen < 8 && !devinfo->is_baytrail) {
+         /* Take into account the two dynamically specified shifts.  Both are
+          * used to implement swizzling of X-tiled surfaces.  For Y-tiled
+          * surfaces only one bit needs to be XOR-ed with bit 6 of the memory
+          * address, so a swz value of 0xff (actually interpreted as 31 by the
+          * hardware) will be provided to cause the relevant bit of tmp.y to
+          * be zero and turn the first XOR into the identity.  For linear
+          * surfaces or platforms lacking address swizzling both shifts will
+          * be 0xff causing the relevant bits of both tmp.x and .y to be zero,
+          * what effectively disables swizzling.
+          */
+         nir_ssa_def *swizzle = load_image_param(b, deref, SWIZZLING);
+         nir_ssa_def *shift0 = nir_ushr(b, addr, nir_channel(b, swizzle, 0));
+         nir_ssa_def *shift1 = nir_ushr(b, addr, nir_channel(b, swizzle, 1));
+
+         /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
+         nir_ssa_def *bit = nir_iand(b, nir_ixor(b, shift0, shift1),
+                                        nir_imm_int(b, 1 << 6));
+         addr = nir_ixor(b, addr, bit);
+      }
+   } else {
+      /* Multiply by the Bpp/stride value.  Note that the addr.y may be
+       * non-zero even if the image is one-dimensional because a vertical
+       * offset may have been applied above to select a non-zero slice or
+       * level of a higher-dimensional texture.
+       */
+      nir_ssa_def *idx;
+      idx = nir_imul(b, nir_channel(b, xypos, 1), nir_channel(b, stride, 1));
+      idx = nir_iadd(b, nir_channel(b, xypos, 0), idx);
+      addr = nir_imul(b, idx, nir_channel(b, stride, 0));
+   }
+
+   return addr;
+}
+
+/* Summary of an ISL format as used by the color conversion helpers in this
+ * file; filled in by get_format_info().
+ */
+struct format_info {
+   const struct isl_format_layout *fmtl;  /* ISL layout of the format */
+   unsigned chans;                        /* number of channels */
+   unsigned bits[4];                      /* bit width of R, G, B, A */
+};
+
+/* Gather the ISL layout, channel count and per-channel (R, G, B, A) bit
+ * widths of \p fmt into a format_info.
+ */
+static struct format_info
+get_format_info(enum isl_format fmt)
+{
+   const struct isl_format_layout *layout = isl_format_get_layout(fmt);
+   struct format_info info;
+
+   info.fmtl = layout;
+   info.chans = isl_format_get_num_channels(fmt);
+   info.bits[0] = layout->channels.r.bits;
+   info.bits[1] = layout->channels.g.bits;
+   info.bits[2] = layout->channels.b.bits;
+   info.bits[3] = layout->channels.a.bits;
+
+   return info;
+}
+
+/* Emit a 32-bit immediate vector with \p num_components components, all of
+ * them zero.
+ */
+static nir_ssa_def *
+nir_zero_vec(nir_builder *b, unsigned num_components)
+{
+   nir_const_value zeros;
+
+   memset(&zeros, 0, sizeof(zeros));
+   return nir_build_imm(b, num_components, 32, zeros);
+}
+
+/* Convert a value that was loaded from an image using \p lower_fmt into the
+ * value a shader expects when reading an image of format \p image_fmt.
+ *
+ * \param b                builder used to emit the conversion
+ * \param devinfo          device info; used for the IVB R8/R16 masking
+ * \param color            raw value as loaded with \p lower_fmt
+ * \param image_fmt        format the image was declared with
+ * \param lower_fmt        format the load was actually performed with
+ * \param dest_components  number of components to return (1 or 4)
+ *
+ * The result is padded up to \p dest_components: missing color channels
+ * are filled with 0 and the fourth channel with 1 (integer or float 1
+ * depending on whether \p image_fmt has integer channels).
+ */
+static nir_ssa_def *
+convert_color_for_load(nir_builder *b, const struct gen_device_info *devinfo,
+                       nir_ssa_def *color,
+                       enum isl_format image_fmt, enum isl_format lower_fmt,
+                       unsigned dest_components)
+{
+   /* Nothing to convert; only the vector width may need fixing up. */
+   if (image_fmt == lower_fmt)
+      goto expand_vec;
+
+   /* R11G11B10_FLOAT has its own dedicated unpack helper. */
+   if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
+      assert(lower_fmt == ISL_FORMAT_R32_UINT);
+      color = nir_format_unpack_11f11f10f(b, color);
+      goto expand_vec;
+   }
+
+   struct format_info image = get_format_info(image_fmt);
+   struct format_info lower = get_format_info(lower_fmt);
+
+   const bool needs_sign_extension =
+      isl_format_has_snorm_channel(image_fmt) ||
+      isl_format_has_sint_channel(image_fmt);
+
+   /* We only check the red channel to detect if we need to pack/unpack */
+   assert(image.bits[0] != lower.bits[0] ||
+          memcmp(image.bits, lower.bits, sizeof(image.bits)) == 0);
+
+   if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
+      /* All channels are packed into a single 32-bit word. */
+      if (needs_sign_extension)
+         color = nir_format_unpack_sint(b, color, image.bits, image.chans);
+      else
+         color = nir_format_unpack_uint(b, color, image.bits, image.chans);
+   } else {
+      /* All these formats are homogeneous */
+      for (unsigned i = 1; i < image.chans; i++)
+         assert(image.bits[i] == image.bits[0]);
+
+      /* On IVB, we rely on the undocumented behavior that typed reads from
+       * surfaces of the unsupported R8 and R16 formats return useful data in
+       * their least significant bits.  However, the data in the high bits is
+       * garbage so we have to discard it.
+       */
+      if (devinfo->gen == 7 && !devinfo->is_haswell &&
+          (lower_fmt == ISL_FORMAT_R16_UINT ||
+           lower_fmt == ISL_FORMAT_R8_UINT))
+         color = nir_format_mask_uvec(b, color, lower.bits);
+
+      /* Re-slice the vector from the lowered channel width to the image's
+       * channel width.
+       */
+      if (image.bits[0] != lower.bits[0]) {
+         color = nir_format_bitcast_uvec_unmasked(b, color, lower.bits[0],
+                                                  image.bits[0]);
+      }
+
+      if (needs_sign_extension)
+         color = nir_format_sign_extend_ivec(b, color, image.bits);
+   }
+
+   /* Convert the integer channel data to the image's numeric type. */
+   switch (image.fmtl->channels.r.type) {
+   case ISL_UNORM:
+      assert(isl_format_has_uint_channel(lower_fmt));
+      color = nir_format_unorm_to_float(b, color, image.bits);
+      break;
+
+   case ISL_SNORM:
+      assert(isl_format_has_uint_channel(lower_fmt));
+      color = nir_format_snorm_to_float(b, color, image.bits);
+      break;
+
+   case ISL_SFLOAT:
+      if (image.bits[0] == 16)
+         color = nir_unpack_half_2x16_split_x(b, color);
+      break;
+
+   case ISL_UINT:
+   case ISL_SINT:
+      break;
+
+   default:
+      unreachable("Invalid image channel type");
+   }
+
+expand_vec:
+   assert(dest_components == 1 || dest_components == 4);
+   assert(color->num_components <= dest_components);
+   if (color->num_components == dest_components)
+      return color;
+
+   nir_ssa_def *comps[4];
+   for (unsigned i = 0; i < color->num_components; i++)
+      comps[i] = nir_channel(b, color, i);
+
+   /* Missing color channels read back as zero... */
+   for (unsigned i = color->num_components; i < 3; i++)
+      comps[i] = nir_imm_int(b, 0);
+
+   /* ...and a missing fourth channel reads back as one. */
+   if (color->num_components < 4) {
+      if (isl_format_has_int_channel(image_fmt))
+         comps[3] = nir_imm_int(b, 1);
+      else
+         comps[3] = nir_imm_float(b, 1);
+   }
+
+   return nir_vec(b, comps, dest_components);
+}
+
+/* Lower an image_deref_load intrinsic.
+ *
+ * If ISL reports a matching typed storage format for the image format, the
+ * intrinsic is kept, its component count is changed to that of the lowered
+ * format, and the result is converted with convert_color_for_load().
+ * Otherwise the intrinsic is replaced by a bounds-checked
+ * image_deref_load_raw_intel at the address computed by image_address(),
+ * with zero substituted when the check fails.
+ *
+ * Always returns true.
+ */
+static bool
+lower_image_load_instr(nir_builder *b,
+                       const struct gen_device_info *devinfo,
+                       nir_intrinsic_instr *intrin)
+{
+   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+   const enum isl_format image_fmt =
+      isl_format_for_gl_format(var->data.image.format);
+
+   if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
+      const enum isl_format lower_fmt =
+         isl_lower_storage_image_format(devinfo, image_fmt);
+      const unsigned dest_components = intrin->num_components;
+
+      /* Use an undef to hold the uses of the load while we do the color
+       * conversion.
+       */
+      nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32);
+      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder));
+
+      /* The load now returns as many components as the lowered format. */
+      intrin->num_components = isl_format_get_num_channels(lower_fmt);
+      intrin->dest.ssa.num_components = intrin->num_components;
+
+      b->cursor = nir_after_instr(&intrin->instr);
+
+      nir_ssa_def *color = convert_color_for_load(b, devinfo,
+                                                  &intrin->dest.ssa,
+                                                  image_fmt, lower_fmt,
+                                                  dest_components);
+
+      /* Point the original users at the converted color and drop the
+       * temporary undef.
+       */
+      nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(color));
+      nir_instr_remove(placeholder->parent_instr);
+   } else {
+      const struct isl_format_layout *image_fmtl =
+         isl_format_get_layout(image_fmt);
+      /* We have a matching typed format for everything 32b and below */
+      assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
+      enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
+                                ISL_FORMAT_R32G32_UINT :
+                                ISL_FORMAT_R32G32B32A32_UINT;
+      const unsigned dest_components = intrin->num_components;
+
+      b->cursor = nir_instr_remove(&intrin->instr);
+
+      nir_ssa_def *coord = intrin->src[1].ssa;
+
+      nir_ssa_def *do_load = image_coord_is_in_bounds(b, deref, coord);
+      if (devinfo->gen == 7 && !devinfo->is_haswell) {
+         /* Check whether the first stride component (i.e. the Bpp value)
+          * is greater than four, which on Gen7 indicates that a surface of
+          * type RAW has been bound for untyped access.  Reading or writing
+          * to a surface of type other than RAW using untyped surface
+          * messages causes a hang on IVB and VLV.
+          */
+         nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
+         nir_ssa_def *is_raw =
+            nir_ilt(b, nir_imm_int(b, 4), nir_channel(b, stride, 0));
+         do_load = nir_iand(b, do_load, is_raw);
+      }
+      nir_push_if(b, do_load);
+
+      /* Then-branch: perform the raw load, one 32-bit component per 32 bits
+       * of texel data.
+       */
+      nir_ssa_def *addr = image_address(b, devinfo, deref, coord);
+      nir_intrinsic_instr *load =
+         nir_intrinsic_instr_create(b->shader,
+                                    nir_intrinsic_image_deref_load_raw_intel);
+      load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+      load->src[1] = nir_src_for_ssa(addr);
+      load->num_components = image_fmtl->bpb / 32;
+      nir_ssa_dest_init(&load->instr, &load->dest,
+                        load->num_components, 32, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+
+      nir_push_else(b, NULL);
+
+      /* Else-branch: the load was skipped, so the raw data is all zeros. */
+      nir_ssa_def *zero = nir_zero_vec(b, load->num_components);
+
+      nir_pop_if(b, NULL);
+
+      nir_ssa_def *value = nir_if_phi(b, &load->dest.ssa, zero);
+
+      nir_ssa_def *color = convert_color_for_load(b, devinfo, value,
+                                                  image_fmt, raw_fmt,
+                                                  dest_components);
+
+      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(color));
+   }
+
+   return true;
+}
+
+/* Convert a shader color value into the representation that has to be
+ * written with \p lower_fmt so that the image, declared with \p image_fmt,
+ * ends up holding the right bits.
+ *
+ * \param b          builder used to emit the conversion
+ * \param devinfo    device info (currently unused here)
+ * \param color      color as provided by the shader
+ * \param image_fmt  format the image was declared with
+ * \param lower_fmt  format the store will actually be performed with
+ * \return the converted value, trimmed to the channels of \p image_fmt and
+ *         packed/bitcast to the layout of \p lower_fmt when they differ
+ */
+static nir_ssa_def *
+convert_color_for_store(nir_builder *b, const struct gen_device_info *devinfo,
+                        nir_ssa_def *color,
+                        enum isl_format image_fmt, enum isl_format lower_fmt)
+{
+   struct format_info image = get_format_info(image_fmt);
+   struct format_info lower = get_format_info(lower_fmt);
+
+   /* Drop any channels the image format does not have. */
+   color = nir_channels(b, color, (1 << image.chans) - 1);
+
+   if (image_fmt == lower_fmt)
+      return color;
+
+   /* R11G11B10_FLOAT has its own dedicated pack helper. */
+   if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
+      assert(lower_fmt == ISL_FORMAT_R32_UINT);
+      return nir_format_pack_11f11f10f(b, color);
+   }
+
+   /* Convert from the shader's numeric type to raw channel bits. */
+   switch (image.fmtl->channels.r.type) {
+   case ISL_UNORM:
+      assert(isl_format_has_uint_channel(lower_fmt));
+      color = nir_format_float_to_unorm(b, color, image.bits);
+      break;
+
+   case ISL_SNORM:
+      assert(isl_format_has_uint_channel(lower_fmt));
+      color = nir_format_float_to_snorm(b, color, image.bits);
+      break;
+
+   case ISL_SFLOAT:
+      if (image.bits[0] == 16) {
+         nir_ssa_def *f16comps[4];
+         for (unsigned i = 0; i < image.chans; i++) {
+            f16comps[i] = nir_pack_half_2x16_split(b, nir_channel(b, color, i),
+                                                      nir_imm_float(b, 0));
+         }
+         color = nir_vec(b, f16comps, image.chans);
+      }
+      break;
+
+   case ISL_UINT:
+      if (image.bits[0] < 32) {
+         /* Clamp to the largest value representable in each channel.  The
+          * constant is zeroed first so that no uninitialized bytes are
+          * copied into the immediate for the channels we don't fill in.
+          */
+         nir_const_value max;
+         memset(&max, 0, sizeof(max));
+         for (unsigned i = 0; i < image.chans; i++) {
+            assert(image.bits[i] < 32);
+            max.u32[i] = (1u << image.bits[i]) - 1;
+         }
+         color = nir_umin(b, color, nir_build_imm(b, image.chans, 32, max));
+      }
+      break;
+
+   case ISL_SINT:
+      if (image.bits[0] < 32) {
+         /* Clamp to the signed range of each channel; see the ISL_UINT case
+          * for why the constants are zeroed first.
+          */
+         nir_const_value min, max;
+         memset(&min, 0, sizeof(min));
+         memset(&max, 0, sizeof(max));
+         for (unsigned i = 0; i < image.chans; i++) {
+            assert(image.bits[i] < 32);
+            max.i32[i] = (1 << (image.bits[i] - 1)) - 1;
+            min.i32[i] = -(1 << (image.bits[i] - 1));
+         }
+         color = nir_imin(b, color, nir_build_imm(b, image.chans, 32, max));
+         color = nir_imax(b, color, nir_build_imm(b, image.chans, 32, min));
+      }
+      break;
+
+   default:
+      unreachable("Invalid image channel type");
+   }
+
+   /* Signed values narrower than 32 bits carry sign-extension bits above the
+    * channel width; mask them off before packing.
+    */
+   if (image.bits[0] < 32 &&
+       (isl_format_has_snorm_channel(image_fmt) ||
+        isl_format_has_sint_channel(image_fmt)))
+      color = nir_format_mask_uvec(b, color, image.bits);
+
+   if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
+      /* All channels are packed into a single 32-bit word. */
+      color = nir_format_pack_uint(b, color, image.bits, image.chans);
+   } else {
+      /* All these formats are homogeneous */
+      for (unsigned i = 1; i < image.chans; i++)
+         assert(image.bits[i] == image.bits[0]);
+
+      if (image.bits[0] != lower.bits[0]) {
+         color = nir_format_bitcast_uvec_unmasked(b, color, image.bits[0],
+                                                  lower.bits[0]);
+      }
+   }
+
+   return color;
+}
+
+/* Lower an image_deref_store intrinsic.
+ *
+ * Stores to write-only image variables are left untouched (returns false).
+ * Otherwise, if ISL reports a matching typed storage format, the stored
+ * color is converted with convert_color_for_store() and the intrinsic is
+ * kept with the lowered format's component count.  If there is no matching
+ * typed format, the intrinsic is replaced by a bounds-checked
+ * image_deref_store_raw_intel at the address computed by image_address().
+ *
+ * Returns true when the instruction was changed.
+ */
+static bool
+lower_image_store_instr(nir_builder *b,
+                        const struct gen_device_info *devinfo,
+                        nir_intrinsic_instr *intrin)
+{
+   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+
+   /* For write-only surfaces, we trust that the hardware can just do the
+    * conversion for us.
+    */
+   if (var->data.image.write_only)
+      return false;
+
+   const enum isl_format image_fmt =
+      isl_format_for_gl_format(var->data.image.format);
+
+   if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
+      const enum isl_format lower_fmt =
+         isl_lower_storage_image_format(devinfo, image_fmt);
+
+      /* Color conversion goes before the store */
+      b->cursor = nir_before_instr(&intrin->instr);
+
+      nir_ssa_def *color = convert_color_for_store(b, devinfo,
+                                                   intrin->src[3].ssa,
+                                                   image_fmt, lower_fmt);
+      intrin->num_components = isl_format_get_num_channels(lower_fmt);
+      nir_instr_rewrite_src(&intrin->instr, &intrin->src[3],
+                            nir_src_for_ssa(color));
+   } else {
+      const struct isl_format_layout *image_fmtl =
+         isl_format_get_layout(image_fmt);
+      /* We have a matching typed format for everything 32b and below */
+      assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
+      enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
+                                ISL_FORMAT_R32G32_UINT :
+                                ISL_FORMAT_R32G32B32A32_UINT;
+
+      b->cursor = nir_instr_remove(&intrin->instr);
+
+      nir_ssa_def *coord = intrin->src[1].ssa;
+
+      nir_ssa_def *do_store = image_coord_is_in_bounds(b, deref, coord);
+      if (devinfo->gen == 7 && !devinfo->is_haswell) {
+         /* Check whether the first stride component (i.e. the Bpp value)
+          * is greater than four, which on Gen7 indicates that a surface of
+          * type RAW has been bound for untyped access.  Reading or writing
+          * to a surface of type other than RAW using untyped surface
+          * messages causes a hang on IVB and VLV.
+          */
+         nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
+         nir_ssa_def *is_raw =
+            nir_ilt(b, nir_imm_int(b, 4), nir_channel(b, stride, 0));
+         do_store = nir_iand(b, do_store, is_raw);
+      }
+      nir_push_if(b, do_store);
+
+      /* Only emitted inside the if: compute the address, convert the color
+       * and issue the raw store, one 32-bit component per 32 bits of texel
+       * data.
+       */
+      nir_ssa_def *addr = image_address(b, devinfo, deref, coord);
+      nir_ssa_def *color = convert_color_for_store(b, devinfo,
+                                                   intrin->src[3].ssa,
+                                                   image_fmt, raw_fmt);
+
+      nir_intrinsic_instr *store =
+         nir_intrinsic_instr_create(b->shader,
+                                    nir_intrinsic_image_deref_store_raw_intel);
+      store->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+      store->src[1] = nir_src_for_ssa(addr);
+      store->src[2] = nir_src_for_ssa(color);
+      store->num_components = image_fmtl->bpb / 32;
+      nir_builder_instr_insert(b, &store->instr);
+
+      nir_pop_if(b, NULL);
+   }
+
+   return true;
+}
+
+/* Guard a typed image atomic so that it only executes when an image is
+ * bound.
+ *
+ * Nothing is done (returns false) on Haswell and on gen8+.  On the other
+ * platforms the atomic is moved inside an "if (size.x != 0)" and its result
+ * is replaced by a phi that yields zero when the atomic was skipped.
+ *
+ * Returns true when the instruction was changed.
+ */
+static bool
+lower_image_atomic_instr(nir_builder *b,
+                         const struct gen_device_info *devinfo,
+                         nir_intrinsic_instr *intrin)
+{
+   if (devinfo->is_haswell || devinfo->gen >= 8)
+      return false;
+
+   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+
+   b->cursor = nir_instr_remove(&intrin->instr);
+
+   /* Use an undef to hold the uses of the atomic while it is being moved
+    * into the conditional.
+    */
+   nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32);
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder));
+
+   /* Check the first component of the size field to find out if the
+    * image is bound.  Necessary on IVB for typed atomics because
+    * they don't seem to respect null surfaces and will happily
+    * corrupt or read random memory when no image is bound.
+    */
+   nir_ssa_def *size = load_image_param(b, deref, SIZE);
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+   nir_push_if(b, nir_ine(b, nir_channel(b, size, 0), zero));
+
+   /* Re-insert the original atomic inside the then-branch. */
+   nir_builder_instr_insert(b, &intrin->instr);
+
+   nir_pop_if(b, NULL);
+
+   nir_ssa_def *result = nir_if_phi(b, &intrin->dest.ssa, zero);
+   nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(result));
+
+   /* The placeholder has no users left; drop it just like the load
+    * lowering does instead of leaving a dead undef in the shader.
+    */
+   nir_instr_remove(placeholder->parent_instr);
+
+   return true;
+}
+
+/* Replace an image_deref_size intrinsic with a vector built from the SIZE
+ * image param.
+ *
+ * Components beyond the image's coordinate count are filled with 1.  Two
+ * image kinds need a fix-up: 1D images take their second component from
+ * size.z, and cube images divide size.z by 6.
+ *
+ * Always returns true.
+ */
+static bool
+lower_image_size_instr(nir_builder *b,
+                       const struct gen_device_info *devinfo,
+                       nir_intrinsic_instr *intrin)
+{
+   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+   const unsigned dest_comps = intrin->dest.ssa.num_components;
+
+   b->cursor = nir_instr_remove(&intrin->instr);
+
+   nir_ssa_def *size = load_image_param(b, deref, SIZE);
+
+   nir_ssa_def *comps[4] = { NULL, NULL, NULL, NULL };
+
+   const enum glsl_sampler_dim dim = glsl_get_sampler_dim(deref->type);
+   const unsigned coord_comps =
+      glsl_get_sampler_coordinate_components(deref->type);
+
+   unsigned c = 0;
+   for (; c < coord_comps; c++) {
+      /* The array length for 1D arrays is in .z */
+      const bool is_1d_layer = (c == 1 && dim == GLSL_SAMPLER_DIM_1D);
+      const bool is_cube_layer = (c == 2 && dim == GLSL_SAMPLER_DIM_CUBE);
+
+      nir_ssa_def *chan = nir_channel(b, size, is_1d_layer ? 2 : c);
+      if (is_cube_layer)
+         chan = nir_idiv(b, chan, nir_imm_int(b, 6));
+
+      comps[c] = chan;
+   }
+
+   /* Pad whatever the destination still needs with ones. */
+   for (; c < dest_comps; c++)
+      comps[c] = nir_imm_int(b, 1);
+
+   nir_ssa_def *vec = nir_vec(b, comps, dest_comps);
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(vec));
+
+   return true;
+}
+
+/* Run the image load/store/atomic/size lowerings over every function
+ * implementation in \p shader.
+ *
+ * \param shader   shader to lower in place
+ * \param devinfo  device info forwarded to the per-instruction lowerings
+ * \return true if any instruction was changed
+ *
+ * The lowerings insert new control flow (nir_push_if) as well as new
+ * instructions, so neither block indices nor dominance information survive
+ * a change.  Metadata is therefore invalidated whenever an implementation
+ * was modified and left alone otherwise.
+ */
+bool
+brw_nir_lower_image_load_store(nir_shader *shader,
+                               const struct gen_device_info *devinfo)
+{
+   bool progress = false;
+
+   nir_foreach_function(function, shader) {
+      if (function->impl == NULL)
+         continue;
+
+      bool impl_progress = false;
+
+      nir_foreach_block_safe(block, function->impl) {
+         nir_builder b;
+         nir_builder_init(&b, function->impl);
+
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            switch (intrin->intrinsic) {
+            case nir_intrinsic_image_deref_load:
+               if (lower_image_load_instr(&b, devinfo, intrin))
+                  impl_progress = true;
+               break;
+
+            case nir_intrinsic_image_deref_store:
+               if (lower_image_store_instr(&b, devinfo, intrin))
+                  impl_progress = true;
+               break;
+
+            case nir_intrinsic_image_deref_atomic_add:
+            case nir_intrinsic_image_deref_atomic_min:
+            case nir_intrinsic_image_deref_atomic_max:
+            case nir_intrinsic_image_deref_atomic_and:
+            case nir_intrinsic_image_deref_atomic_or:
+            case nir_intrinsic_image_deref_atomic_xor:
+            case nir_intrinsic_image_deref_atomic_exchange:
+            case nir_intrinsic_image_deref_atomic_comp_swap:
+               if (lower_image_atomic_instr(&b, devinfo, intrin))
+                  impl_progress = true;
+               break;
+
+            case nir_intrinsic_image_deref_size:
+               if (lower_image_size_instr(&b, devinfo, intrin))
+                  impl_progress = true;
+               break;
+
+            default:
+               /* Nothing to do */
+               break;
+            }
+         }
+      }
+
+      if (impl_progress) {
+         /* New blocks may have been created, so nothing is preserved. */
+         nir_metadata_preserve(function->impl, nir_metadata_none);
+         progress = true;
+      }
+   }
+
+   return progress;
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build

index 98860c943742b8f7ce5b3614c285dc7df52bbdf5..3cdeb6214a88f97405069f85480580c5f5ec33cb 100644 (file)
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -77,6 +77,7 @@ libintel_compiler_files = files(
    'brw_nir_analyze_ubo_ranges.c',
    'brw_nir_attribute_workarounds.c',
    'brw_nir_lower_cs_intrinsics.c',
+  'brw_nir_lower_image_load_store.c',
    'brw_nir_opt_peephole_ffma.c',
    'brw_nir_tcs_workarounds.c',
    'brw_packed_float.c',
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c

index 0fe0c7e296e79fe98ea107e517a537dfda299a37..19d59b7fbac7fb1749e09c1e36795f27d7983ace 100644 (file)
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -532,6 +532,8 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
     if (nir->info.stage != MESA_SHADER_COMPUTE)
        brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
  
+   NIR_PASS_V(nir, brw_nir_lower_image_load_store, compiler->devinfo);
+
     assert(nir->num_uniforms == prog_data->nr_params * 4);
  
     stage->nir = nir;
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c

index a669814d0d28eb79c3eac34a7c4a3c4022716e09..f5ebd3c3b059de25dba9b978c7bf46f479580e50 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -102,6 +102,8 @@ brw_create_nir(struct brw_context *brw,
  
     nir = brw_preprocess_nir(brw->screen->compiler, nir);
  
+   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);
+
     if (stage == MESA_SHADER_TESS_CTRL) {
        /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
        static const gl_state_index16 tokens[STATE_LENGTH] =
author	Jason Ekstrand <jason.ekstrand@intel.com>
	Sat, 27 Jan 2018 21:19:57 +0000 (13:19 -0800)
committer	Jason Ekstrand <jason.ekstrand@intel.com>
	Wed, 29 Aug 2018 19:04:02 +0000 (14:04 -0500)
src/compiler/nir/nir_intrinsics.py		patch \| blob \| history
src/intel/Makefile.sources		patch \| blob \| history
src/intel/compiler/brw_fs_nir.cpp		patch \| blob \| history
src/intel/compiler/brw_fs_surface_builder.cpp		patch \| blob \| history
src/intel/compiler/brw_fs_surface_builder.h		patch \| blob \| history
src/intel/compiler/brw_nir.h		patch \| blob \| history
src/intel/compiler/brw_nir_lower_image_load_store.c	[new file with mode: 0644]	patch \| blob
src/intel/compiler/meson.build		patch \| blob \| history
src/intel/vulkan/anv_pipeline.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_program.c		patch \| blob \| history