intrinsic("image_deref_size", src_comp=[1], dest_comp=0, flags=[CAN_ELIMINATE, CAN_REORDER])
intrinsic("image_deref_samples", src_comp=[1], dest_comp=1, flags=[CAN_ELIMINATE, CAN_REORDER])
+# Intel-specific query for loading from the brw_image_param struct passed
+# into the shader as a uniform. The variable is a deref to the image
+# variable. The const index specifies which of the six parameters to load.
+intrinsic("image_deref_load_param_intel", src_comp=[1], dest_comp=0,
+ indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
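+# Intel-specific raw access to a storage image.  src[0] is the image deref
+# and src[1] is a raw memory offset within the surface, as computed by
+# brw_nir_lower_image_load_store for images whose format has no matching
+# typed surface format; the store variant additionally takes the data to
+# write in src[2].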
+intrinsic("image_deref_load_raw_intel", src_comp=[1, 1], dest_comp=0,
+ flags=[CAN_ELIMINATE])
+intrinsic("image_deref_store_raw_intel", src_comp=[1, 1, 0])
+
# Vulkan descriptor set intrinsics
#
# The Vulkan API uses a different binding model from GL. In the Vulkan
compiler/brw_nir_analyze_ubo_ranges.c \
compiler/brw_nir_attribute_workarounds.c \
compiler/brw_nir_lower_cs_intrinsics.c \
+ compiler/brw_nir_lower_image_load_store.c \
compiler/brw_nir_opt_peephole_ffma.c \
compiler/brw_nir_tcs_workarounds.c \
compiler/brw_packed_float.c \
case nir_intrinsic_image_deref_atomic_xor:
case nir_intrinsic_image_deref_atomic_exchange:
case nir_intrinsic_image_deref_atomic_comp_swap: {
- using namespace image_access;
-
if (stage == MESA_SHADER_FRAGMENT &&
instr->intrinsic != nir_intrinsic_image_deref_load)
brw_wm_prog_data(prog_data)->has_side_effects = true;
/* Get the referenced image variable and type. */
nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
- const nir_variable *var = nir_deref_instr_get_variable(deref);
- const glsl_type *type = var->type->without_array();
- const brw_reg_type base_type = get_image_base_type(type);
+ const glsl_type *type = deref->type;
/* Get some metadata from the image intrinsic. */
const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
- const unsigned arr_dims = type->sampler_array ? 1 : 0;
- const unsigned surf_dims = type->coordinate_components() - arr_dims;
- const unsigned format = var->data.image.format;
+ const unsigned dims = type->coordinate_components();
const unsigned dest_components = nir_intrinsic_dest_components(instr);
/* Get the arguments of the image intrinsic. */
const fs_reg image = get_nir_image_deref(deref);
- const fs_reg addr = retype(get_nir_src(instr->src[1]),
- BRW_REGISTER_TYPE_UD);
+ const fs_reg coords = retype(get_nir_src(instr->src[1]),
+ BRW_REGISTER_TYPE_UD);
fs_reg tmp;
/* Emit an image load, store or atomic op. */
- if (instr->intrinsic == nir_intrinsic_image_deref_load)
- tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
- else if (instr->intrinsic == nir_intrinsic_image_deref_store) {
- const fs_reg src0 = retype(get_nir_src(instr->src[3]), base_type);
- emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
- var->data.image.write_only ? GL_NONE : format);
+ if (instr->intrinsic == nir_intrinsic_image_deref_load) {
+ tmp = emit_typed_read(bld, image, coords, dims,
+ instr->num_components);
+ } else if (instr->intrinsic == nir_intrinsic_image_deref_store) {
+ const fs_reg src0 = get_nir_src(instr->src[3]);
+ emit_typed_write(bld, image, coords, src0, dims,
+ instr->num_components);
} else {
int op;
unsigned num_srcs = info->num_srcs;
}
const fs_reg src0 = (num_srcs >= 4 ?
- retype(get_nir_src(instr->src[3]), base_type) :
- fs_reg());
+ get_nir_src(instr->src[3]) : fs_reg());
const fs_reg src1 = (num_srcs >= 5 ?
- retype(get_nir_src(instr->src[4]), base_type) :
- fs_reg());
+ get_nir_src(instr->src[4]) : fs_reg());
- tmp = emit_image_atomic(bld, image, addr, src0, src1,
- surf_dims, arr_dims, dest_components,
- op);
+ tmp = emit_typed_atomic(bld, image, coords, src0, src1, dims, 1, op);
}
/* Assign the result. */
for (unsigned c = 0; c < dest_components; ++c) {
- bld.MOV(offset(retype(dest, base_type), bld, c),
- offset(tmp, bld, c));
+ bld.MOV(offset(retype(dest, tmp.type), bld, c),
+ offset(tmp, bld, c));
+ }
+ break;
+ }
+
+ case nir_intrinsic_image_deref_load_param_intel: {
+ nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
+ const fs_reg image = get_nir_image_deref(deref);
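+ /* BASE selects one of the brw_image_param fields.  Each field of the
+ * struct starts on a four-dword boundary (see BRW_IMAGE_PARAM_*_OFFSET),
+ * so multiplying by 4 recovers the dword offset of the field within the
+ * image param uniform. */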
+ const fs_reg param = offset(image, bld, nir_intrinsic_base(instr) * 4);
+ for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
+ bld.MOV(offset(retype(dest, param.type), bld, c),
+ offset(param, bld, c));
+ }
+ break;
+ }
+
+ case nir_intrinsic_image_deref_load_raw_intel: {
+ const fs_reg image = get_nir_image_deref(nir_src_as_deref(instr->src[0]));
+ const fs_reg addr = retype(get_nir_src(instr->src[1]),
+ BRW_REGISTER_TYPE_UD);
+
+ fs_reg tmp = emit_untyped_read(bld, image, addr, 1,
+ instr->num_components);
+
+ for (unsigned c = 0; c < instr->num_components; ++c) {
+ bld.MOV(offset(retype(dest, tmp.type), bld, c),
+ offset(tmp, bld, c));
}
break;
}
+ case nir_intrinsic_image_deref_store_raw_intel: {
+ const fs_reg image = get_nir_image_deref(nir_src_as_deref(instr->src[0]));
+ const fs_reg addr = retype(get_nir_src(instr->src[1]),
+ BRW_REGISTER_TYPE_UD);
+ const fs_reg data = retype(get_nir_src(instr->src[2]),
+ BRW_REGISTER_TYPE_UD);
+
+ brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+ emit_untyped_write(bld, image, addr, data, 1,
+ instr->num_components);
+ break;
+ }
+
case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier_shared:
case nir_intrinsic_memory_barrier_atomic_counter:
break;
}
- case nir_intrinsic_image_deref_size: {
- /* Get the referenced image variable and type. */
- nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
- const nir_variable *var = nir_deref_instr_get_variable(deref);
- const glsl_type *type = var->type->without_array();
-
- /* Get the size of the image. */
- const fs_reg image = get_nir_image_deref(deref);
- const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
- /* For 1DArray image types, the array index is stored in the Z component.
- * Fix this by swizzling the Z component to the Y component.
- */
- const bool is_1d_array_image =
- type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
- type->sampler_array;
-
- /* For CubeArray images, we should count the number of cubes instead
- * of the number of faces. Fix it by dividing the (Z component) by 6.
- */
- const bool is_cube_array_image =
- type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
- type->sampler_array;
-
- /* Copy all the components. */
- for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
- if ((int)c >= type->coordinate_components()) {
- bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
- brw_imm_d(1));
- } else if (c == 1 && is_1d_array_image) {
- bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
- offset(size, bld, 2));
- } else if (c == 2 && is_cube_array_image) {
- bld.emit(SHADER_OPCODE_INT_QUOTIENT,
- offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
- offset(size, bld, c), brw_imm_d(6));
- } else {
- bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
- offset(size, bld, c));
- }
- }
-
- break;
- }
-
case nir_intrinsic_image_deref_samples:
/* The driver does not support multi-sampled images. */
bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
}
}
}
-
-namespace {
- namespace image_format_info {
- /* The higher compiler layers use the GL enums for image formats even if
- * they come in from SPIR-V or Vulkan. We need to turn them into an ISL
- * enum before we can use them.
- */
- static enum isl_format
- isl_format_for_gl_format(uint32_t gl_format)
- {
- switch (gl_format) {
- case GL_R8: return ISL_FORMAT_R8_UNORM;
- case GL_R8_SNORM: return ISL_FORMAT_R8_SNORM;
- case GL_R8UI: return ISL_FORMAT_R8_UINT;
- case GL_R8I: return ISL_FORMAT_R8_SINT;
- case GL_RG8: return ISL_FORMAT_R8G8_UNORM;
- case GL_RG8_SNORM: return ISL_FORMAT_R8G8_SNORM;
- case GL_RG8UI: return ISL_FORMAT_R8G8_UINT;
- case GL_RG8I: return ISL_FORMAT_R8G8_SINT;
- case GL_RGBA8: return ISL_FORMAT_R8G8B8A8_UNORM;
- case GL_RGBA8_SNORM: return ISL_FORMAT_R8G8B8A8_SNORM;
- case GL_RGBA8UI: return ISL_FORMAT_R8G8B8A8_UINT;
- case GL_RGBA8I: return ISL_FORMAT_R8G8B8A8_SINT;
- case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
- case GL_RGB10_A2: return ISL_FORMAT_R10G10B10A2_UNORM;
- case GL_RGB10_A2UI: return ISL_FORMAT_R10G10B10A2_UINT;
- case GL_R16: return ISL_FORMAT_R16_UNORM;
- case GL_R16_SNORM: return ISL_FORMAT_R16_SNORM;
- case GL_R16F: return ISL_FORMAT_R16_FLOAT;
- case GL_R16UI: return ISL_FORMAT_R16_UINT;
- case GL_R16I: return ISL_FORMAT_R16_SINT;
- case GL_RG16: return ISL_FORMAT_R16G16_UNORM;
- case GL_RG16_SNORM: return ISL_FORMAT_R16G16_SNORM;
- case GL_RG16F: return ISL_FORMAT_R16G16_FLOAT;
- case GL_RG16UI: return ISL_FORMAT_R16G16_UINT;
- case GL_RG16I: return ISL_FORMAT_R16G16_SINT;
- case GL_RGBA16: return ISL_FORMAT_R16G16B16A16_UNORM;
- case GL_RGBA16_SNORM: return ISL_FORMAT_R16G16B16A16_SNORM;
- case GL_RGBA16F: return ISL_FORMAT_R16G16B16A16_FLOAT;
- case GL_RGBA16UI: return ISL_FORMAT_R16G16B16A16_UINT;
- case GL_RGBA16I: return ISL_FORMAT_R16G16B16A16_SINT;
- case GL_R32F: return ISL_FORMAT_R32_FLOAT;
- case GL_R32UI: return ISL_FORMAT_R32_UINT;
- case GL_R32I: return ISL_FORMAT_R32_SINT;
- case GL_RG32F: return ISL_FORMAT_R32G32_FLOAT;
- case GL_RG32UI: return ISL_FORMAT_R32G32_UINT;
- case GL_RG32I: return ISL_FORMAT_R32G32_SINT;
- case GL_RGBA32F: return ISL_FORMAT_R32G32B32A32_FLOAT;
- case GL_RGBA32UI: return ISL_FORMAT_R32G32B32A32_UINT;
- case GL_RGBA32I: return ISL_FORMAT_R32G32B32A32_SINT;
- case GL_NONE: return ISL_FORMAT_UNSUPPORTED;
- default:
- assert(!"Invalid image format");
- return ISL_FORMAT_UNSUPPORTED;
- }
- }
-
- /**
- * Simple 4-tuple of scalars used to pass around per-color component
- * values.
- */
- struct color_u {
- color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
- {
- }
-
- color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
- r(r), g(g), b(b), a(a)
- {
- }
-
- unsigned
- operator[](unsigned i) const
- {
- const unsigned xs[] = { r, g, b, a };
- return xs[i];
- }
-
- unsigned r, g, b, a;
- };
-
- /**
- * Return the per-channel bitfield widths for a given image format.
- */
- inline color_u
- get_bit_widths(isl_format format)
- {
- const isl_format_layout *fmtl = isl_format_get_layout(format);
-
- return color_u(fmtl->channels.r.bits,
- fmtl->channels.g.bits,
- fmtl->channels.b.bits,
- fmtl->channels.a.bits);
- }
-
- /**
- * Return the per-channel bitfield shifts for a given image format.
- */
- inline color_u
- get_bit_shifts(isl_format format)
- {
- const color_u widths = get_bit_widths(format);
- return color_u(0, widths.r, widths.r + widths.g,
- widths.r + widths.g + widths.b);
- }
-
- /**
- * Return true if all present components have the same bit width.
- */
- inline bool
- is_homogeneous(isl_format format)
- {
- const color_u widths = get_bit_widths(format);
- return ((widths.g == 0 || widths.g == widths.r) &&
- (widths.b == 0 || widths.b == widths.r) &&
- (widths.a == 0 || widths.a == widths.r));
- }
-
- /**
- * Return true if the format conversion boils down to a trivial copy.
- */
- inline bool
- is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
- {
- return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
- format == isl_lower_storage_image_format(devinfo, format);
- }
-
- /**
- * Return true if the hardware natively supports some format with
- * compatible bitfield layout, but possibly different data types.
- */
- inline bool
- has_supported_bit_layout(const gen_device_info *devinfo,
- isl_format format)
- {
- const color_u widths = get_bit_widths(format);
- const color_u lower_widths = get_bit_widths(
- isl_lower_storage_image_format(devinfo, format));
-
- return (widths.r == lower_widths.r &&
- widths.g == lower_widths.g &&
- widths.b == lower_widths.b &&
- widths.a == lower_widths.a);
- }
-
- /**
- * Return true if we are required to spread individual components over
- * several components of the format used by the hardware (RG32 and
- * friends implemented as RGBA16UI).
- */
- inline bool
- has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
- {
- const isl_format lower_format =
- isl_lower_storage_image_format(devinfo, format);
-
- return (isl_format_get_num_channels(format) <
- isl_format_get_num_channels(lower_format));
- }
-
- /**
- * Return true if the hardware returns garbage in the unused high bits
- * of each component. This may happen on IVB because we rely on the
- * undocumented behavior that typed reads from surfaces of the
- * unsupported R8 and R16 formats return useful data in their least
- * significant bits.
- */
- inline bool
- has_undefined_high_bits(const gen_device_info *devinfo,
- isl_format format)
- {
- const isl_format lower_format =
- isl_lower_storage_image_format(devinfo, format);
-
- return (devinfo->gen == 7 && !devinfo->is_haswell &&
- (lower_format == ISL_FORMAT_R16_UINT ||
- lower_format == ISL_FORMAT_R8_UINT));
- }
-
- /**
- * Return true if the format represents values as signed integers
- * requiring sign extension when unpacking.
- */
- inline bool
- needs_sign_extension(isl_format format)
- {
- return isl_format_has_snorm_channel(format) ||
- isl_format_has_sint_channel(format);
- }
- }
-
- namespace image_validity {
- /**
- * Check whether the bound image is suitable for untyped access.
- */
- static brw_predicate
- emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
- brw_predicate pred)
- {
- const gen_device_info *devinfo = bld.shader->devinfo;
- const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
-
- if (devinfo->gen == 7 && !devinfo->is_haswell) {
- /* Check whether the first stride component (i.e. the Bpp value)
- * is greater than four, what on Gen7 indicates that a surface of
- * type RAW has been bound for untyped access. Reading or writing
- * to a surface of type other than RAW using untyped surface
- * messages causes a hang on IVB and VLV.
- */
- set_predicate(pred,
- bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
- BRW_CONDITIONAL_G));
-
- return BRW_PREDICATE_NORMAL;
- } else {
- /* More recent generations handle the format mismatch
- * gracefully.
- */
- return pred;
- }
- }
-
- /**
- * Check whether there is an image bound at the given index and write
- * the comparison result to f0.0. Returns an appropriate predication
- * mode to use on subsequent image operations.
- */
- static brw_predicate
- emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
- {
- const gen_device_info *devinfo = bld.shader->devinfo;
- const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
- if (devinfo->gen == 7 && !devinfo->is_haswell) {
- /* Check the first component of the size field to find out if the
- * image is bound. Necessary on IVB for typed atomics because
- * they don't seem to respect null surfaces and will happily
- * corrupt or read random memory when no image is bound.
- */
- bld.CMP(bld.null_reg_ud(),
- retype(size, BRW_REGISTER_TYPE_UD),
- brw_imm_d(0), BRW_CONDITIONAL_NZ);
-
- return BRW_PREDICATE_NORMAL;
- } else {
- /* More recent platforms implement compliant behavior when a null
- * surface is bound.
- */
- return BRW_PREDICATE_NONE;
- }
- }
-
- /**
- * Check whether the provided coordinates are within the image bounds
- * and write the comparison result to f0.0. Returns an appropriate
- * predication mode to use on subsequent image operations.
- */
- static brw_predicate
- emit_bounds_check(const fs_builder &bld, const fs_reg &image,
- const fs_reg &addr, unsigned dims)
- {
- const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
- for (unsigned c = 0; c < dims; ++c)
- set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
- bld.CMP(bld.null_reg_ud(),
- offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
- offset(size, bld, c),
- BRW_CONDITIONAL_L));
-
- return BRW_PREDICATE_NORMAL;
- }
- }
-
- namespace image_coordinates {
- /**
- * Return the total number of coordinates needed to address a texel of
- * the surface, which may be more than the sum of \p surf_dims and \p
- * arr_dims if padding is required.
- */
- static unsigned
- num_image_coordinates(const fs_builder &bld,
- unsigned surf_dims, unsigned arr_dims,
- isl_format format)
- {
- /* HSW in vec4 mode and our software coordinate handling for untyped
- * reads want the array index to be at the Z component.
- */
- const bool array_index_at_z =
- format != ISL_FORMAT_UNSUPPORTED &&
- !isl_has_matching_typed_storage_image_format(
- bld.shader->devinfo, format);
- const unsigned zero_dims =
- ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
-
- return surf_dims + zero_dims + arr_dims;
- }
-
- /**
- * Transform image coordinates into the form expected by the
- * implementation.
- */
- static fs_reg
- emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
- unsigned surf_dims, unsigned arr_dims,
- isl_format format)
- {
- const unsigned dims =
- num_image_coordinates(bld, surf_dims, arr_dims, format);
-
- if (dims > surf_dims + arr_dims) {
- assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
- /* The array index is required to be passed in as the Z component,
- * insert a zero at the Y component to shift it to the right
- * position.
- *
- * FINISHME: Factor out this frequently recurring pattern into a
- * helper function.
- */
- const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
- const fs_reg dst = bld.vgrf(addr.type, dims);
- bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
- return dst;
- } else {
- return addr;
- }
- }
-
- /**
- * Calculate the offset in memory of the texel given by \p coord.
- *
- * This is meant to be used with untyped surface messages to access a
- * tiled surface, what involves taking into account the tiling and
- * swizzling modes of the surface manually so it will hopefully not
- * happen very often.
- *
- * The tiling algorithm implemented here matches either the X or Y
- * tiling layouts supported by the hardware depending on the tiling
- * coefficients passed to the program as uniforms. See Volume 1 Part 2
- * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
- * explanation of the hardware tiling format.
- */
- static fs_reg
- emit_address_calculation(const fs_builder &bld, const fs_reg &image,
- const fs_reg &coord, unsigned dims)
- {
- const gen_device_info *devinfo = bld.shader->devinfo;
- const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
- const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
- const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
- const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
- const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
-
- /* Shift the coordinates by the fixed surface offset. It may be
- * non-zero if the image is a single slice of a higher-dimensional
- * surface, or if a non-zero mipmap level of the surface is bound to
- * the pipeline. The offset needs to be applied here rather than at
- * surface state set-up time because the desired slice-level may
- * start mid-tile, so simply shifting the surface base address
- * wouldn't give a well-formed tiled surface in the general case.
- */
- for (unsigned c = 0; c < 2; ++c)
- bld.ADD(offset(addr, bld, c), offset(off, bld, c),
- (c < dims ?
- offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
- fs_reg(brw_imm_d(0))));
-
- /* The layout of 3-D textures in memory is sort-of like a tiling
- * format. At each miplevel, the slices are arranged in rows of
- * 2^level slices per row. The slice row is stored in tmp.y and
- * the slice within the row is stored in tmp.x.
- *
- * The layout of 2-D array textures and cubemaps is much simpler:
- * Depending on whether the ARYSPC_LOD0 layout is in use it will be
- * stored in memory as an array of slices, each one being a 2-D
- * arrangement of miplevels, or as a 2D arrangement of miplevels,
- * each one being an array of slices. In either case the separation
- * between slices of the same LOD is equal to the qpitch value
- * provided as stride.w.
- *
- * This code can be made to handle either 2D arrays and 3D textures
- * by passing in the miplevel as tile.z for 3-D textures and 0 in
- * tile.z for 2-D array textures.
- *
- * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
- * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
- * of the hardware 3D texture and 2D array layouts.
- */
- if (dims > 2) {
- /* Decompose z into a major (tmp.y) and a minor (tmp.x)
- * index.
- */
- bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
- offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
- bld.SHR(offset(tmp, bld, 1),
- offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
- offset(tile, bld, 2));
-
- /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
- * slice offset.
- */
- for (unsigned c = 0; c < 2; ++c) {
- bld.MUL(offset(tmp, bld, c),
- offset(stride, bld, 2 + c), offset(tmp, bld, c));
- bld.ADD(offset(addr, bld, c),
- offset(addr, bld, c), offset(tmp, bld, c));
- }
- }
-
- if (dims > 1) {
- /* Calculate the major/minor x and y indices. In order to
- * accommodate both X and Y tiling, the Y-major tiling format is
- * treated as being a bunch of narrow X-tiles placed next to each
- * other. This means that the tile width for Y-tiling is actually
- * the width of one sub-column of the Y-major tile where each 4K
- * tile has 8 512B sub-columns.
- *
- * The major Y value is the row of tiles in which the pixel lives.
- * The major X value is the tile sub-column in which the pixel
- * lives; for X tiling, this is the same as the tile column, for Y
- * tiling, each tile has 8 sub-columns. The minor X and Y indices
- * are the position within the sub-column.
- */
- for (unsigned c = 0; c < 2; ++c) {
- /* Calculate the minor x and y indices. */
- bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
- brw_imm_d(0), offset(addr, bld, c));
-
- /* Calculate the major x and y indices. */
- bld.SHR(offset(major, bld, c),
- offset(addr, bld, c), offset(tile, bld, c));
- }
-
- /* Calculate the texel index from the start of the tile row and
- * the vertical coordinate of the row.
- * Equivalent to:
- * tmp.x = (major.x << tile.y << tile.x) +
- * (minor.y << tile.x) + minor.x
- * tmp.y = major.y << tile.y
- */
- bld.SHL(tmp, major, offset(tile, bld, 1));
- bld.ADD(tmp, tmp, offset(minor, bld, 1));
- bld.SHL(tmp, tmp, offset(tile, bld, 0));
- bld.ADD(tmp, tmp, minor);
- bld.SHL(offset(tmp, bld, 1),
- offset(major, bld, 1), offset(tile, bld, 1));
-
- /* Add it to the start of the tile row. */
- bld.MUL(offset(tmp, bld, 1),
- offset(tmp, bld, 1), offset(stride, bld, 1));
- bld.ADD(tmp, tmp, offset(tmp, bld, 1));
-
- /* Multiply by the Bpp value. */
- bld.MUL(dst, tmp, stride);
-
- if (devinfo->gen < 8 && !devinfo->is_baytrail) {
- /* Take into account the two dynamically specified shifts.
- * Both need are used to implement swizzling of X-tiled
- * surfaces. For Y-tiled surfaces only one bit needs to be
- * XOR-ed with bit 6 of the memory address, so a swz value of
- * 0xff (actually interpreted as 31 by the hardware) will be
- * provided to cause the relevant bit of tmp.y to be zero and
- * turn the first XOR into the identity. For linear surfaces
- * or platforms lacking address swizzling both shifts will be
- * 0xff causing the relevant bits of both tmp.x and .y to be
- * zero, what effectively disables swizzling.
- */
- for (unsigned c = 0; c < 2; ++c)
- bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));
-
- /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
- bld.XOR(tmp, tmp, offset(tmp, bld, 1));
- bld.AND(tmp, tmp, brw_imm_d(1 << 6));
- bld.XOR(dst, dst, tmp);
- }
-
- } else {
- /* Multiply by the Bpp/stride value. Note that the addr.y may be
- * non-zero even if the image is one-dimensional because a
- * vertical offset may have been applied above to select a
- * non-zero slice or level of a higher-dimensional texture.
- */
- bld.MUL(offset(addr, bld, 1),
- offset(addr, bld, 1), offset(stride, bld, 1));
- bld.ADD(addr, addr, offset(addr, bld, 1));
- bld.MUL(dst, addr, stride);
- }
-
- return dst;
- }
- }
-
- namespace image_format_conversion {
- using image_format_info::color_u;
-
- namespace {
- /**
- * Maximum representable value in an unsigned integer with the given
- * number of bits.
- */
- inline unsigned
- scale(unsigned n)
- {
- return (1 << n) - 1;
- }
- }
-
- /**
- * Pack the vector \p src in a bitfield given the per-component bit
- * shifts and widths. Note that bitfield components are not allowed to
- * cross 32-bit boundaries.
- */
- static fs_reg
- emit_pack(const fs_builder &bld, const fs_reg &src,
- const color_u &shifts, const color_u &widths)
- {
- const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
- bool seen[4] = {};
-
- for (unsigned c = 0; c < 4; ++c) {
- if (widths[c]) {
- const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
-
- /* Shift each component left to the correct bitfield position. */
- bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));
-
- /* Add everything up. */
- if (seen[shifts[c] / 32]) {
- bld.OR(offset(dst, bld, shifts[c] / 32),
- offset(dst, bld, shifts[c] / 32), tmp);
- } else {
- bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
- seen[shifts[c] / 32] = true;
- }
- }
- }
-
- return dst;
- }
-
- /**
- * Unpack a vector from the bitfield \p src given the per-component bit
- * shifts and widths. Note that bitfield components are not allowed to
- * cross 32-bit boundaries.
- */
- static fs_reg
- emit_unpack(const fs_builder &bld, const fs_reg &src,
- const color_u &shifts, const color_u &widths)
- {
- const fs_reg dst = bld.vgrf(src.type, 4);
-
- for (unsigned c = 0; c < 4; ++c) {
- if (widths[c]) {
- /* Shift left to discard the most significant bits. */
- bld.SHL(offset(dst, bld, c),
- offset(src, bld, shifts[c] / 32),
- brw_imm_ud(32 - shifts[c] % 32 - widths[c]));
-
- /* Shift back to the least significant bits using an arithmetic
- * shift to get sign extension on signed types.
- */
- bld.ASR(offset(dst, bld, c),
- offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
- }
- }
-
- return dst;
- }
-
- /**
- * Convert an integer vector into another integer vector of the
- * specified bit widths, properly handling overflow.
- */
- static fs_reg
- emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
- const color_u &widths, bool is_signed)
- {
- const unsigned s = (is_signed ? 1 : 0);
- const fs_reg dst = bld.vgrf(
- is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
- assert(src.type == dst.type);
-
- for (unsigned c = 0; c < 4; ++c) {
- if (widths[c]) {
- /* Clamp to the maximum value. */
- bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
- brw_imm_d((int)scale(widths[c] - s)),
- BRW_CONDITIONAL_L);
-
- /* Clamp to the minimum value. */
- if (is_signed)
- bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
- brw_imm_d(-(int)scale(widths[c] - s) - 1),
- BRW_CONDITIONAL_GE);
-
- /* Mask off all but the bits we actually want. Otherwise, if
- * we pass a negative number into the hardware when it's
- * expecting something like UINT8, it will happily clamp it to
- * +255 for us.
- */
- if (is_signed && widths[c] < 32)
- bld.AND(offset(dst, bld, c), offset(dst, bld, c),
- brw_imm_d(scale(widths[c])));
- }
- }
-
- return dst;
- }
-
- /**
- * Convert a normalized fixed-point vector of the specified signedness
- * and bit widths into a floating point vector.
- */
- static fs_reg
- emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
- const color_u &widths, bool is_signed)
- {
- const unsigned s = (is_signed ? 1 : 0);
- const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
-
- for (unsigned c = 0; c < 4; ++c) {
- if (widths[c]) {
- /* Convert to float. */
- bld.MOV(offset(dst, bld, c), offset(src, bld, c));
-
- /* Divide by the normalization constants. */
- bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
- brw_imm_f(1.0f / scale(widths[c] - s)));
-
- /* Clamp to the minimum value. */
- if (is_signed)
- bld.emit_minmax(offset(dst, bld, c),
- offset(dst, bld, c), brw_imm_f(-1.0f),
- BRW_CONDITIONAL_GE);
- }
- }
- return dst;
- }
-
- /**
- * Convert a floating-point vector into a normalized fixed-point vector
- * of the specified signedness and bit widths.
- */
- static fs_reg
- emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
- const color_u &widths, bool is_signed)
- {
- const unsigned s = (is_signed ? 1 : 0);
- const fs_reg dst = bld.vgrf(
- is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
- const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
-
- for (unsigned c = 0; c < 4; ++c) {
- if (widths[c]) {
- /* Clamp the normalized floating-point argument. */
- if (is_signed) {
- bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
- brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);
-
- bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
- brw_imm_f(1.0f), BRW_CONDITIONAL_L);
- } else {
- set_saturate(true, bld.MOV(offset(fdst, bld, c),
- offset(src, bld, c)));
- }
-
- /* Multiply by the normalization constants. */
- bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
- brw_imm_f((float)scale(widths[c] - s)));
-
- /* Convert to integer. */
- bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
- bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));
-
- /* Mask off all but the bits we actually want. Otherwise, if
- * we pass a negative number into the hardware when it's
- * expecting something like UINT8, it will happily clamp it to
- * +255 for us.
- */
- if (is_signed && widths[c] < 32)
- bld.AND(offset(dst, bld, c), offset(dst, bld, c),
- brw_imm_d(scale(widths[c])));
- }
- }
-
- return dst;
- }
-
- /**
- * Convert a floating point vector of the specified bit widths into a
- * 32-bit floating point vector.
- */
- static fs_reg
- emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
- const color_u &widths)
- {
- const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
- const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
-
- for (unsigned c = 0; c < 4; ++c) {
- if (widths[c]) {
- bld.MOV(offset(dst, bld, c), offset(src, bld, c));
-
- /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
- * This works because they have a 5-bit exponent just like the
- * 16-bit floating point format, and they have no sign bit.
- */
- if (widths[c] < 16)
- bld.SHL(offset(dst, bld, c),
- offset(dst, bld, c), brw_imm_ud(15 - widths[c]));
-
- /* Convert to 32-bit floating point. */
- bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
- }
- }
-
- return fdst;
- }
-
- /**
- * Convert a vector into a floating point vector of the specified bit
- * widths.
- */
- static fs_reg
- emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
- const color_u &widths)
- {
- const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
- const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
-
- for (unsigned c = 0; c < 4; ++c) {
- if (widths[c]) {
- bld.MOV(offset(fdst, bld, c), offset(src, bld, c));
-
- /* Clamp to the minimum value. */
- if (widths[c] < 16)
- bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
- brw_imm_f(0.0f), BRW_CONDITIONAL_GE);
-
- /* Convert to 16-bit floating-point. */
- bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));
-
- /* Discard the least significant bits to get floating point
- * numbers of the requested width. This works because the
- * 10-bit and 11-bit floating point formats have a 5-bit
- * exponent just like the 16-bit format, and they have no sign
- * bit.
- */
- if (widths[c] < 16)
- bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
- brw_imm_ud(15 - widths[c]));
- }
- }
-
- return dst;
- }
-
- /**
- * Fill missing components of a vector with 0, 0, 0, 1.
- */
- static fs_reg
- emit_pad(const fs_builder &bld, const fs_reg &src,
- const color_u &widths)
- {
- const fs_reg dst = bld.vgrf(src.type, 4);
- const unsigned pad[] = { 0, 0, 0, 1 };
-
- for (unsigned c = 0; c < 4; ++c)
- bld.MOV(offset(dst, bld, c),
- widths[c] ? offset(src, bld, c)
- : fs_reg(brw_imm_ud(pad[c])));
-
- return dst;
- }
- }
-}
-
-namespace brw {
- namespace image_access {
- /**
- * Load a vector from a surface of the given format and dimensionality
- * at the given coordinates. \p surf_dims and \p arr_dims give the
- * number of non-array and array coordinates of the image respectively.
- */
- fs_reg
- emit_image_load(const fs_builder &bld,
- const fs_reg &image, const fs_reg &addr,
- unsigned surf_dims, unsigned arr_dims,
- unsigned gl_format)
- {
- using namespace image_format_info;
- using namespace image_format_conversion;
- using namespace image_validity;
- using namespace image_coordinates;
- using namespace surface_access;
- const gen_device_info *devinfo = bld.shader->devinfo;
- const isl_format format = isl_format_for_gl_format(gl_format);
- const isl_format lower_format =
- isl_lower_storage_image_format(devinfo, format);
- fs_reg tmp;
-
- /* Transform the image coordinates into actual surface coordinates. */
- const fs_reg saddr =
- emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
- const unsigned dims =
- num_image_coordinates(bld, surf_dims, arr_dims, format);
-
- if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
- /* Hopefully we get here most of the time... */
- tmp = emit_typed_read(bld, image, saddr, dims,
- isl_format_get_num_channels(lower_format));
- } else {
- /* Untyped surface reads return 32 bits of the surface per
- * component, without any sort of unpacking or type conversion,
- */
- const unsigned size = isl_format_get_layout(format)->bpb / 32;
- /* they don't properly handle out of bounds access, so we have to
- * check manually if the coordinates are valid and predicate the
- * surface read on the result,
- */
- const brw_predicate pred =
- emit_untyped_image_check(bld, image,
- emit_bounds_check(bld, image,
- saddr, dims));
-
- /* and they don't know about surface coordinates, we need to
- * convert them to a raw memory offset.
- */
- const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);
-
- tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);
-
- /* An out of bounds surface access should give zero as result. */
- for (unsigned c = 0; c < size; ++c)
- set_predicate(pred, bld.SEL(offset(tmp, bld, c),
- offset(tmp, bld, c), brw_imm_d(0)));
- }
-
- /* Set the register type to D instead of UD if the data type is
- * represented as a signed integer in memory so that sign extension
- * is handled correctly by unpack.
- */
- if (needs_sign_extension(format))
- tmp = retype(tmp, BRW_REGISTER_TYPE_D);
-
- if (!has_supported_bit_layout(devinfo, format)) {
- /* Unpack individual vector components from the bitfield if the
- * hardware is unable to do it for us.
- */
- if (has_split_bit_layout(devinfo, format))
- tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
- get_bit_widths(lower_format));
- else
- tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
- get_bit_widths(format));
-
- } else if ((needs_sign_extension(format) &&
- !is_conversion_trivial(devinfo, format)) ||
- has_undefined_high_bits(devinfo, format)) {
- /* Perform a trivial unpack even though the bit layout matches in
- * order to get the most significant bits of each component
- * initialized properly.
- */
- tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
- get_bit_widths(format));
- }
-
- if (!isl_format_has_int_channel(format)) {
- if (is_conversion_trivial(devinfo, format)) {
- /* Just need to cast the vector to the target type. */
- tmp = retype(tmp, BRW_REGISTER_TYPE_F);
- } else {
- /* Do the right sort of type conversion to float. */
- if (isl_format_has_float_channel(format))
- tmp = emit_convert_from_float(
- bld, tmp, get_bit_widths(format));
- else
- tmp = emit_convert_from_scaled(
- bld, tmp, get_bit_widths(format),
- isl_format_has_snorm_channel(format));
- }
- }
-
- /* Initialize missing components of the result. */
- return emit_pad(bld, tmp, get_bit_widths(format));
- }
-
- /**
- * Store a vector in a surface of the given format and dimensionality at
- * the given coordinates. \p surf_dims and \p arr_dims give the number
- * of non-array and array coordinates of the image respectively.
- */
- void
- emit_image_store(const fs_builder &bld, const fs_reg &image,
- const fs_reg &addr, const fs_reg &src,
- unsigned surf_dims, unsigned arr_dims,
- unsigned gl_format)
- {
- using namespace image_format_info;
- using namespace image_format_conversion;
- using namespace image_validity;
- using namespace image_coordinates;
- using namespace surface_access;
- const isl_format format = isl_format_for_gl_format(gl_format);
- const gen_device_info *devinfo = bld.shader->devinfo;
-
- /* Transform the image coordinates into actual surface coordinates. */
- const fs_reg saddr =
- emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
- const unsigned dims =
- num_image_coordinates(bld, surf_dims, arr_dims, format);
-
- if (gl_format == GL_NONE) {
- /* We don't know what the format is, but that's fine because it
- * implies write-only access, and typed surface writes are always
- * able to take care of type conversion and packing for us.
- */
- emit_typed_write(bld, image, saddr, src, dims, 4);
-
- } else {
- const isl_format lower_format =
- isl_lower_storage_image_format(devinfo, format);
- fs_reg tmp = src;
-
- if (!is_conversion_trivial(devinfo, format)) {
- /* Do the right sort of type conversion. */
- if (isl_format_has_float_channel(format))
- tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));
-
- else if (isl_format_has_int_channel(format))
- tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
- isl_format_has_sint_channel(format));
-
- else
- tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
- isl_format_has_snorm_channel(format));
- }
-
- /* We're down to bit manipulation at this point. */
- tmp = retype(tmp, BRW_REGISTER_TYPE_UD);
-
- if (!has_supported_bit_layout(devinfo, format)) {
- /* Pack the vector components into a bitfield if the hardware
- * is unable to do it for us.
- */
- if (has_split_bit_layout(devinfo, format))
- tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
- get_bit_widths(lower_format));
-
- else
- tmp = emit_pack(bld, tmp, get_bit_shifts(format),
- get_bit_widths(format));
- }
-
- if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
- /* Hopefully we get here most of the time... */
- emit_typed_write(bld, image, saddr, tmp, dims,
- isl_format_get_num_channels(lower_format));
-
- } else {
- /* Untyped surface writes store 32 bits of the surface per
- * component, without any sort of packing or type conversion,
- */
- const unsigned size = isl_format_get_layout(format)->bpb / 32;
-
- /* they don't properly handle out of bounds access, so we have
- * to check manually if the coordinates are valid and predicate
- * the surface write on the result,
- */
- const brw_predicate pred =
- emit_untyped_image_check(bld, image,
- emit_bounds_check(bld, image,
- saddr, dims));
-
- /* and, phew, they don't know about surface coordinates, we
- * need to convert them to a raw memory offset.
- */
- const fs_reg laddr = emit_address_calculation(
- bld, image, saddr, dims);
-
- emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
- }
- }
- }
-
- /**
- * Perform an atomic read-modify-write operation in a surface of the
- * given dimensionality at the given coordinates. \p surf_dims and \p
- * arr_dims give the number of non-array and array coordinates of the
- * image respectively. Main building block of the imageAtomic GLSL
- * built-ins.
- */
- fs_reg
- emit_image_atomic(const fs_builder &bld,
- const fs_reg &image, const fs_reg &addr,
- const fs_reg &src0, const fs_reg &src1,
- unsigned surf_dims, unsigned arr_dims,
- unsigned rsize, unsigned op)
- {
- using namespace image_validity;
- using namespace image_coordinates;
- using namespace surface_access;
- /* Avoid performing an atomic operation on an unbound surface. */
- const brw_predicate pred = emit_typed_atomic_check(bld, image);
-
- /* Transform the image coordinates into actual surface coordinates. */
- const fs_reg saddr =
- emit_image_coordinates(bld, addr, surf_dims, arr_dims,
- ISL_FORMAT_R32_UINT);
- const unsigned dims =
- num_image_coordinates(bld, surf_dims, arr_dims,
- ISL_FORMAT_R32_UINT);
-
- /* Thankfully we can do without untyped atomics here. */
- const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
- dims, rsize, op, pred);
-
- /* An unbound surface access should give zero as result. */
- if (rsize && pred)
- set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));
-
- return retype(tmp, src0.type);
- }
- }
-}
unsigned bit_size,
brw_predicate pred = BRW_PREDICATE_NONE);
}
-
- namespace image_access {
- fs_reg
- emit_image_load(const fs_builder &bld,
- const fs_reg &image, const fs_reg &addr,
- unsigned surf_dims, unsigned arr_dims,
- unsigned gl_format);
-
- void
- emit_image_store(const fs_builder &bld, const fs_reg &image,
- const fs_reg &addr, const fs_reg &src,
- unsigned surf_dims, unsigned arr_dims,
- unsigned gl_format);
- fs_reg
- emit_image_atomic(const fs_builder &bld,
- const fs_reg &image, const fs_reg &addr,
- const fs_reg &src0, const fs_reg &src1,
- unsigned surf_dims, unsigned arr_dims,
- unsigned rsize, unsigned op);
- }
}
#endif
GLenum tes_primitive_mode);
void brw_nir_lower_fs_outputs(nir_shader *nir);
+bool brw_nir_lower_image_load_store(nir_shader *nir,
+ const struct gen_device_info *devinfo);
+
nir_shader *brw_postprocess_nir(nir_shader *nir,
const struct brw_compiler *compiler,
bool is_scalar);
--- /dev/null
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "isl/isl.h"
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
+
+/* The higher compiler layers use the GL enums for image formats even if
+ * they come in from SPIR-V or Vulkan. We need to turn them into an ISL
+ * enum before we can use them.
+ */
+static enum isl_format
+isl_format_for_gl_format(uint32_t gl_format)
+{
+ switch (gl_format) {
+ case GL_R8: return ISL_FORMAT_R8_UNORM;
+ case GL_R8_SNORM: return ISL_FORMAT_R8_SNORM;
+ case GL_R8UI: return ISL_FORMAT_R8_UINT;
+ case GL_R8I: return ISL_FORMAT_R8_SINT;
+ case GL_RG8: return ISL_FORMAT_R8G8_UNORM;
+ case GL_RG8_SNORM: return ISL_FORMAT_R8G8_SNORM;
+ case GL_RG8UI: return ISL_FORMAT_R8G8_UINT;
+ case GL_RG8I: return ISL_FORMAT_R8G8_SINT;
+ case GL_RGBA8: return ISL_FORMAT_R8G8B8A8_UNORM;
+ case GL_RGBA8_SNORM: return ISL_FORMAT_R8G8B8A8_SNORM;
+ case GL_RGBA8UI: return ISL_FORMAT_R8G8B8A8_UINT;
+ case GL_RGBA8I: return ISL_FORMAT_R8G8B8A8_SINT;
+ case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
+ case GL_RGB10_A2: return ISL_FORMAT_R10G10B10A2_UNORM;
+ case GL_RGB10_A2UI: return ISL_FORMAT_R10G10B10A2_UINT;
+ case GL_R16: return ISL_FORMAT_R16_UNORM;
+ case GL_R16_SNORM: return ISL_FORMAT_R16_SNORM;
+ case GL_R16F: return ISL_FORMAT_R16_FLOAT;
+ case GL_R16UI: return ISL_FORMAT_R16_UINT;
+ case GL_R16I: return ISL_FORMAT_R16_SINT;
+ case GL_RG16: return ISL_FORMAT_R16G16_UNORM;
+ case GL_RG16_SNORM: return ISL_FORMAT_R16G16_SNORM;
+ case GL_RG16F: return ISL_FORMAT_R16G16_FLOAT;
+ case GL_RG16UI: return ISL_FORMAT_R16G16_UINT;
+ case GL_RG16I: return ISL_FORMAT_R16G16_SINT;
+ case GL_RGBA16: return ISL_FORMAT_R16G16B16A16_UNORM;
+ case GL_RGBA16_SNORM: return ISL_FORMAT_R16G16B16A16_SNORM;
+ case GL_RGBA16F: return ISL_FORMAT_R16G16B16A16_FLOAT;
+ case GL_RGBA16UI: return ISL_FORMAT_R16G16B16A16_UINT;
+ case GL_RGBA16I: return ISL_FORMAT_R16G16B16A16_SINT;
+ case GL_R32F: return ISL_FORMAT_R32_FLOAT;
+ case GL_R32UI: return ISL_FORMAT_R32_UINT;
+ case GL_R32I: return ISL_FORMAT_R32_SINT;
+ case GL_RG32F: return ISL_FORMAT_R32G32_FLOAT;
+ case GL_RG32UI: return ISL_FORMAT_R32G32_UINT;
+ case GL_RG32I: return ISL_FORMAT_R32G32_SINT;
+ case GL_RGBA32F: return ISL_FORMAT_R32G32B32A32_FLOAT;
+ case GL_RGBA32UI: return ISL_FORMAT_R32G32B32A32_UINT;
+ case GL_RGBA32I: return ISL_FORMAT_R32G32B32A32_SINT;
+ case GL_NONE: return ISL_FORMAT_UNSUPPORTED;
+ default:
+ assert(!"Invalid image format");
+ return ISL_FORMAT_UNSUPPORTED;
+ }
+}
+
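+/* Load one field of the brw_image_param struct for the given image by
+ * emitting an image_deref_load_param_intel intrinsic.  The BASE index is
+ * the field's dword offset divided by four, and the number of result
+ * components matches the size of the requested field.
+ */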
+static nir_ssa_def *
+_load_image_param(nir_builder *b, nir_deref_instr *deref, unsigned offset)
+{
+ nir_intrinsic_instr *load =
+ nir_intrinsic_instr_create(b->shader,
+ nir_intrinsic_image_deref_load_param_intel);
+ load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+ nir_intrinsic_set_base(load, offset / 4);
+
+ switch (offset) {
+ case BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET:
+ load->num_components = 1;
+ break;
+ case BRW_IMAGE_PARAM_OFFSET_OFFSET:
+ case BRW_IMAGE_PARAM_SWIZZLING_OFFSET:
+ load->num_components = 2;
+ break;
+ case BRW_IMAGE_PARAM_TILING_OFFSET:
+ case BRW_IMAGE_PARAM_SIZE_OFFSET:
+ load->num_components = 3;
+ break;
+ case BRW_IMAGE_PARAM_STRIDE_OFFSET:
+ load->num_components = 4;
+ break;
+ default:
+ unreachable("Invalid param offset");
+ }
+ nir_ssa_dest_init(&load->instr, &load->dest,
+ load->num_components, 32, NULL);
+
+ nir_builder_instr_insert(b, &load->instr);
+ return &load->dest.ssa;
+}
+
+#define load_image_param(b, d, o) \
+ _load_image_param(b, d, BRW_IMAGE_PARAM_##o##_OFFSET)
+
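+/* Trim the coordinate vector down to the components the image actually
+ * uses, and move the array index of a 1D array image into the Z component
+ * so it can be handled like a 2D array by the size and address
+ * calculations below.
+ */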
+static nir_ssa_def *
+sanitize_image_coord(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *coord)
+{
+ if (glsl_get_sampler_dim(deref->type) == GLSL_SAMPLER_DIM_1D &&
+ glsl_sampler_type_is_array(deref->type)) {
+ /* It's easier if 1D arrays are treated like 2D arrays */
+ return nir_vec3(b, nir_channel(b, coord, 0),
+ nir_imm_int(b, 0),
+ nir_channel(b, coord, 1));
+ } else {
+ unsigned dims = glsl_get_sampler_coordinate_components(deref->type);
+ return nir_channels(b, coord, (1 << dims) - 1);
+ }
+}
+
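+/* Compare every coordinate component against the corresponding component
+ * of the image size parameter and AND the results together, returning true
+ * only when the whole coordinate is within bounds.
+ */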
+static nir_ssa_def *
+image_coord_is_in_bounds(nir_builder *b, nir_deref_instr *deref,
+ nir_ssa_def *coord)
+{
+ coord = sanitize_image_coord(b, deref, coord);
+ nir_ssa_def *size = load_image_param(b, deref, SIZE);
+
+ nir_ssa_def *cmp = nir_ilt(b, coord, size);
+ nir_ssa_def *in_bounds = nir_imm_int(b, NIR_TRUE);
+ for (unsigned i = 0; i < coord->num_components; i++)
+ in_bounds = nir_iand(b, in_bounds, nir_channel(b, cmp, i));
+
+ return in_bounds;
+}
+
+/** Calculate the offset in memory of the texel given by \p coord.
+ *
+ * This is meant to be used with untyped surface messages to access a tiled
+ * surface, which involves taking into account the tiling and swizzling modes
+ * of the surface manually so it will hopefully not happen very often.
+ *
+ * The tiling algorithm implemented here matches either the X or Y tiling
+ * layouts supported by the hardware depending on the tiling coefficients
+ * passed to the program as uniforms. See Volume 1 Part 2 Section 4.5
+ * "Address Tiling Function" of the IVB PRM for an in-depth explanation of
+ * the hardware tiling format.
+ */
+static nir_ssa_def *
+image_address(nir_builder *b, const struct gen_device_info *devinfo,
+ nir_deref_instr *deref, nir_ssa_def *coord)
+{
+ coord = sanitize_image_coord(b, deref, coord);
+
+ nir_ssa_def *offset = load_image_param(b, deref, OFFSET);
+ nir_ssa_def *tiling = load_image_param(b, deref, TILING);
+ nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
+
+ /* Shift the coordinates by the fixed surface offset. It may be non-zero
+ * if the image is a single slice of a higher-dimensional surface, or if a
+ * non-zero mipmap level of the surface is bound to the pipeline. The
+ * offset needs to be applied here rather than at surface state set-up time
+ * because the desired slice-level may start mid-tile, so simply shifting
+ * the surface base address wouldn't give a well-formed tiled surface in
+ * the general case.
+ */
+ nir_ssa_def *xypos = (coord->num_components == 1) ?
+ nir_vec2(b, coord, nir_imm_int(b, 0)) :
+ nir_channels(b, coord, 0x3);
+ xypos = nir_iadd(b, xypos, offset);
+
+ /* The layout of 3-D textures in memory is sort-of like a tiling
+ * format. At each miplevel, the slices are arranged in rows of
+ * 2^level slices per row. The slice row is stored in tmp.y and
+ * the slice within the row is stored in tmp.x.
+ *
+ * The layout of 2-D array textures and cubemaps is much simpler:
+ * Depending on whether the ARYSPC_LOD0 layout is in use it will be
+ * stored in memory as an array of slices, each one being a 2-D
+ * arrangement of miplevels, or as a 2D arrangement of miplevels,
+ * each one being an array of slices. In either case the separation
+ * between slices of the same LOD is equal to the qpitch value
+ * provided as stride.w.
+ *
+ * This code can be made to handle either 2D arrays or 3D textures
+ * by passing in the miplevel as tile.z for 3-D textures and 0 in
+ * tile.z for 2-D array textures.
+ *
+ * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
+ * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
+ * of the hardware 3D texture and 2D array layouts.
+ */
+ if (coord->num_components > 2) {
+ /* Decompose z into a major (tmp.y) and a minor (tmp.x)
+ * index.
+ */
+ nir_ssa_def *z = nir_channel(b, coord, 2);
+ nir_ssa_def *z_x = nir_ubfe(b, z, nir_imm_int(b, 0),
+ nir_channel(b, tiling, 2));
+ nir_ssa_def *z_y = nir_ushr(b, z, nir_channel(b, tiling, 2));
+
+ /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
+ * slice offset.
+ */
+ xypos = nir_iadd(b, xypos, nir_imul(b, nir_vec2(b, z_x, z_y),
+ nir_channels(b, stride, 0xc)));
+ }
+
+ nir_ssa_def *addr;
+ if (coord->num_components > 1) {
+ /* Calculate the major/minor x and y indices. In order to
+ * accommodate both X and Y tiling, the Y-major tiling format is
+ * treated as being a bunch of narrow X-tiles placed next to each
+ * other. This means that the tile width for Y-tiling is actually
+ * the width of one sub-column of the Y-major tile where each 4K
+ * tile has 8 512B sub-columns.
+ *
+ * The major Y value is the row of tiles in which the pixel lives.
+ * The major X value is the tile sub-column in which the pixel
+ * lives; for X tiling, this is the same as the tile column, for Y
+ * tiling, each tile has 8 sub-columns. The minor X and Y indices
+ * are the position within the sub-column.
+ */
+
+ /* Calculate the minor x and y indices. */
+ nir_ssa_def *minor = nir_ubfe(b, xypos, nir_imm_int(b, 0),
+ nir_channels(b, tiling, 0x3));
+ nir_ssa_def *major = nir_ushr(b, xypos, nir_channels(b, tiling, 0x3));
+
+ /* Calculate the texel index from the start of the tile row and the
+ * vertical coordinate of the row.
+ * Equivalent to:
+ * tmp.x = (major.x << tile.y << tile.x) +
+ * (minor.y << tile.x) + minor.x
+ * tmp.y = major.y << tile.y
+ */
+ nir_ssa_def *idx_x, *idx_y;
+ idx_x = nir_ishl(b, nir_channel(b, major, 0), nir_channel(b, tiling, 1));
+ idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 1));
+ idx_x = nir_ishl(b, idx_x, nir_channel(b, tiling, 0));
+ idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 0));
+ idx_y = nir_ishl(b, nir_channel(b, major, 1), nir_channel(b, tiling, 1));
+
+ /* Add it to the start of the tile row. */
+ nir_ssa_def *idx;
+ idx = nir_imul(b, idx_y, nir_channel(b, stride, 1));
+ idx = nir_iadd(b, idx, idx_x);
+
+ /* Multiply by the Bpp value. */
+ addr = nir_imul(b, idx, nir_channel(b, stride, 0));
+
+ if (devinfo->gen < 8 && !devinfo->is_baytrail) {
+ /* Take into account the two dynamically specified shifts. Both are
+ * used to implement swizzling of X-tiled surfaces. For Y-tiled
+ * surfaces only one bit needs to be XOR-ed with bit 6 of the memory
+ * address, so a swz value of 0xff (actually interpreted as 31 by the
+ * hardware) will be provided to cause the relevant bit of tmp.y to
+ * be zero and turn the first XOR into the identity. For linear
+ * surfaces or platforms lacking address swizzling both shifts will
+ * be 0xff causing the relevant bits of both tmp.x and .y to be zero,
+ * which effectively disables swizzling.
+ */
+ nir_ssa_def *swizzle = load_image_param(b, deref, SWIZZLING);
+ nir_ssa_def *shift0 = nir_ushr(b, addr, nir_channel(b, swizzle, 0));
+ nir_ssa_def *shift1 = nir_ushr(b, addr, nir_channel(b, swizzle, 1));
+
+ /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
+ nir_ssa_def *bit = nir_iand(b, nir_ixor(b, shift0, shift1),
+ nir_imm_int(b, 1 << 6));
+ addr = nir_ixor(b, addr, bit);
+ }
+ } else {
+ /* Multiply by the Bpp/stride value. Note that the addr.y may be
+ * non-zero even if the image is one-dimensional because a vertical
+ * offset may have been applied above to select a non-zero slice or
+ * level of a higher-dimensional texture.
+ */
+ nir_ssa_def *idx;
+ idx = nir_imul(b, nir_channel(b, xypos, 1), nir_channel(b, stride, 1));
+ idx = nir_iadd(b, nir_channel(b, xypos, 0), idx);
+ addr = nir_imul(b, idx, nir_channel(b, stride, 0));
+ }
+
+ return addr;
+}
+
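+/* Per-channel bit widths of a format, kept alongside its isl_format_layout
+ * so they can be fed to the nir_format_convert helpers.
+ */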
+struct format_info {
+ const struct isl_format_layout *fmtl;
+ unsigned chans;
+ unsigned bits[4];
+};
+
+static struct format_info
+get_format_info(enum isl_format fmt)
+{
+ const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
+
+ return (struct format_info) {
+ .fmtl = fmtl,
+ .chans = isl_format_get_num_channels(fmt),
+ .bits = {
+ fmtl->channels.r.bits,
+ fmtl->channels.g.bits,
+ fmtl->channels.b.bits,
+ fmtl->channels.a.bits
+ },
+ };
+}
+
+static nir_ssa_def *
+nir_zero_vec(nir_builder *b, unsigned num_components)
+{
+ nir_const_value v;
+ memset(&v, 0, sizeof(v));
+
+ return nir_build_imm(b, num_components, 32, v);
+}
+
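+/* Convert the raw value returned by a read in lower_fmt into the value the
+ * shader expects for image_fmt, then pad the result out to dest_components
+ * with the usual (0, 0, 0, 1) fill.
+ */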
+static nir_ssa_def *
+convert_color_for_load(nir_builder *b, const struct gen_device_info *devinfo,
+ nir_ssa_def *color,
+ enum isl_format image_fmt, enum isl_format lower_fmt,
+ unsigned dest_components)
+{
+ if (image_fmt == lower_fmt)
+ goto expand_vec;
+
+ if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
+ assert(lower_fmt == ISL_FORMAT_R32_UINT);
+ color = nir_format_unpack_11f11f10f(b, color);
+ goto expand_vec;
+ }
+
+ struct format_info image = get_format_info(image_fmt);
+ struct format_info lower = get_format_info(lower_fmt);
+
+ const bool needs_sign_extension =
+ isl_format_has_snorm_channel(image_fmt) ||
+ isl_format_has_sint_channel(image_fmt);
+
+ /* We only check the red channel to detect if we need to pack/unpack */
+ assert(image.bits[0] != lower.bits[0] ||
+ memcmp(image.bits, lower.bits, sizeof(image.bits)) == 0);
+
+ if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
+ if (needs_sign_extension)
+ color = nir_format_unpack_sint(b, color, image.bits, image.chans);
+ else
+ color = nir_format_unpack_uint(b, color, image.bits, image.chans);
+ } else {
+ /* All these formats are homogeneous */
+ for (unsigned i = 1; i < image.chans; i++)
+ assert(image.bits[i] == image.bits[0]);
+
+ /* On IVB, we rely on the undocumented behavior that typed reads from
+ * surfaces of the unsupported R8 and R16 formats return useful data in
+ * their least significant bits. However, the data in the high bits is
+ * garbage so we have to discard it.
+ */
+ if (devinfo->gen == 7 && !devinfo->is_haswell &&
+ (lower_fmt == ISL_FORMAT_R16_UINT ||
+ lower_fmt == ISL_FORMAT_R8_UINT))
+ color = nir_format_mask_uvec(b, color, lower.bits);
+
+ if (image.bits[0] != lower.bits[0]) {
+ color = nir_format_bitcast_uvec_unmasked(b, color, lower.bits[0],
+ image.bits[0]);
+ }
+
+ if (needs_sign_extension)
+ color = nir_format_sign_extend_ivec(b, color, image.bits);
+ }
+
+ switch (image.fmtl->channels.r.type) {
+ case ISL_UNORM:
+ assert(isl_format_has_uint_channel(lower_fmt));
+ color = nir_format_unorm_to_float(b, color, image.bits);
+ break;
+
+ case ISL_SNORM:
+ assert(isl_format_has_uint_channel(lower_fmt));
+ color = nir_format_snorm_to_float(b, color, image.bits);
+ break;
+
+ case ISL_SFLOAT:
+ if (image.bits[0] == 16)
+ color = nir_unpack_half_2x16_split_x(b, color);
+ break;
+
+ case ISL_UINT:
+ case ISL_SINT:
+ break;
+
+ default:
+ unreachable("Invalid image channel type");
+ }
+
+expand_vec:
+ assert(dest_components == 1 || dest_components == 4);
+ assert(color->num_components <= dest_components);
+ if (color->num_components == dest_components)
+ return color;
+
+ nir_ssa_def *comps[4];
+ for (unsigned i = 0; i < color->num_components; i++)
+ comps[i] = nir_channel(b, color, i);
+
+ for (unsigned i = color->num_components; i < 3; i++)
+ comps[i] = nir_imm_int(b, 0);
+
+ if (color->num_components < 4) {
+ if (isl_format_has_int_channel(image_fmt))
+ comps[3] = nir_imm_int(b, 1);
+ else
+ comps[3] = nir_imm_float(b, 1);
+ }
+
+ return nir_vec(b, comps, dest_components);
+}
+
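+/* Lowers an image load intrinsic. When the hardware provides a compatible
+ * typed format, the load is emitted in that format and the result is
+ * converted back to the image format. Otherwise (64 and 128 bpp formats)
+ * the load is replaced with a bounds-checked raw load followed by a format
+ * conversion.
+ */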
+static bool
+lower_image_load_instr(nir_builder *b,
+ const struct gen_device_info *devinfo,
+ nir_intrinsic_instr *intrin)
+{
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+ const enum isl_format image_fmt =
+ isl_format_for_gl_format(var->data.image.format);
+
+ if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
+ const enum isl_format lower_fmt =
+ isl_lower_storage_image_format(devinfo, image_fmt);
+ const unsigned dest_components = intrin->num_components;
+
+ /* Use an undef to hold the uses of the load while we do the color
+ * conversion.
+ */
+ nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32);
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder));
+
+ intrin->num_components = isl_format_get_num_channels(lower_fmt);
+ intrin->dest.ssa.num_components = intrin->num_components;
+
+ b->cursor = nir_after_instr(&intrin->instr);
+
+ nir_ssa_def *color = convert_color_for_load(b, devinfo,
+ &intrin->dest.ssa,
+ image_fmt, lower_fmt,
+ dest_components);
+
+ nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(color));
+ nir_instr_remove(placeholder->parent_instr);
+ } else {
+ const struct isl_format_layout *image_fmtl =
+ isl_format_get_layout(image_fmt);
+ /* We have a matching typed format for everything 32b and below */
+ assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
+ enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
+ ISL_FORMAT_R32G32_UINT :
+ ISL_FORMAT_R32G32B32A32_UINT;
+ const unsigned dest_components = intrin->num_components;
+
+ b->cursor = nir_instr_remove(&intrin->instr);
+
+ nir_ssa_def *coord = intrin->src[1].ssa;
+
+ nir_ssa_def *do_load = image_coord_is_in_bounds(b, deref, coord);
+ if (devinfo->gen == 7 && !devinfo->is_haswell) {
+         /* Check whether the first stride component (i.e. the Bpp value)
+          * is greater than four, which on Gen7 indicates that a surface of
+          * type RAW has been bound for untyped access. Reading from or
+          * writing to a surface of a type other than RAW using untyped
+          * surface messages causes a hang on IVB and VLV.
+          */
+ nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
+ nir_ssa_def *is_raw =
+ nir_ilt(b, nir_imm_int(b, 4), nir_channel(b, stride, 0));
+ do_load = nir_iand(b, do_load, is_raw);
+ }
+ nir_push_if(b, do_load);
+
+ nir_ssa_def *addr = image_address(b, devinfo, deref, coord);
+ nir_intrinsic_instr *load =
+ nir_intrinsic_instr_create(b->shader,
+ nir_intrinsic_image_deref_load_raw_intel);
+ load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+ load->src[1] = nir_src_for_ssa(addr);
+ load->num_components = image_fmtl->bpb / 32;
+ nir_ssa_dest_init(&load->instr, &load->dest,
+ load->num_components, 32, NULL);
+ nir_builder_instr_insert(b, &load->instr);
+
+ nir_push_else(b, NULL);
+
+ nir_ssa_def *zero = nir_zero_vec(b, load->num_components);
+
+ nir_pop_if(b, NULL);
+
+ nir_ssa_def *value = nir_if_phi(b, &load->dest.ssa, zero);
+
+ nir_ssa_def *color = convert_color_for_load(b, devinfo, value,
+ image_fmt, raw_fmt,
+ dest_components);
+
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(color));
+ }
+
+ return true;
+}
+
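+/* Converts a color value from the shader-visible image format into the
+ * lowered format used by the typed or raw surface message, including any
+ * clamping, normalization and bit packing the target format requires.
+ */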
+static nir_ssa_def *
+convert_color_for_store(nir_builder *b, const struct gen_device_info *devinfo,
+ nir_ssa_def *color,
+ enum isl_format image_fmt, enum isl_format lower_fmt)
+{
+ struct format_info image = get_format_info(image_fmt);
+ struct format_info lower = get_format_info(lower_fmt);
+
+ color = nir_channels(b, color, (1 << image.chans) - 1);
+
+ if (image_fmt == lower_fmt)
+ return color;
+
+ if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
+ assert(lower_fmt == ISL_FORMAT_R32_UINT);
+ return nir_format_pack_11f11f10f(b, color);
+ }
+
+ switch (image.fmtl->channels.r.type) {
+ case ISL_UNORM:
+ assert(isl_format_has_uint_channel(lower_fmt));
+ color = nir_format_float_to_unorm(b, color, image.bits);
+ break;
+
+ case ISL_SNORM:
+ assert(isl_format_has_uint_channel(lower_fmt));
+ color = nir_format_float_to_snorm(b, color, image.bits);
+ break;
+
+ case ISL_SFLOAT:
+ if (image.bits[0] == 16) {
+ nir_ssa_def *f16comps[4];
+ for (unsigned i = 0; i < image.chans; i++) {
+ f16comps[i] = nir_pack_half_2x16_split(b, nir_channel(b, color, i),
+ nir_imm_float(b, 0));
+ }
+ color = nir_vec(b, f16comps, image.chans);
+ }
+ break;
+
+ case ISL_UINT:
+ if (image.bits[0] < 32) {
+ nir_const_value max;
+ for (unsigned i = 0; i < image.chans; i++) {
+ assert(image.bits[i] < 32);
+ max.u32[i] = (1u << image.bits[i]) - 1;
+ }
+ color = nir_umin(b, color, nir_build_imm(b, image.chans, 32, max));
+ }
+ break;
+
+ case ISL_SINT:
+ if (image.bits[0] < 32) {
+ nir_const_value min, max;
+ for (unsigned i = 0; i < image.chans; i++) {
+ assert(image.bits[i] < 32);
+ max.i32[i] = (1 << (image.bits[i] - 1)) - 1;
+ min.i32[i] = -(1 << (image.bits[i] - 1));
+ }
+ color = nir_imin(b, color, nir_build_imm(b, image.chans, 32, max));
+ color = nir_imax(b, color, nir_build_imm(b, image.chans, 32, min));
+ }
+ break;
+
+ default:
+ unreachable("Invalid image channel type");
+ }
+
+ if (image.bits[0] < 32 &&
+ (isl_format_has_snorm_channel(image_fmt) ||
+ isl_format_has_sint_channel(image_fmt)))
+ color = nir_format_mask_uvec(b, color, image.bits);
+
+ if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
+ color = nir_format_pack_uint(b, color, image.bits, image.chans);
+ } else {
+ /* All these formats are homogeneous */
+ for (unsigned i = 1; i < image.chans; i++)
+ assert(image.bits[i] == image.bits[0]);
+
+ if (image.bits[0] != lower.bits[0]) {
+ color = nir_format_bitcast_uvec_unmasked(b, color, image.bits[0],
+ lower.bits[0]);
+ }
+ }
+
+ return color;
+}
+
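+/* Lowers an image store intrinsic. Write-only images are left untouched
+ * since the hardware handles the format conversion. Otherwise the color is
+ * converted to the lowered typed format or, for 64 and 128 bpp formats, the
+ * store is replaced with a bounds-checked raw store.
+ */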
+static bool
+lower_image_store_instr(nir_builder *b,
+ const struct gen_device_info *devinfo,
+ nir_intrinsic_instr *intrin)
+{
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+
+ /* For write-only surfaces, we trust that the hardware can just do the
+ * conversion for us.
+ */
+ if (var->data.image.write_only)
+ return false;
+
+ const enum isl_format image_fmt =
+ isl_format_for_gl_format(var->data.image.format);
+
+ if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
+ const enum isl_format lower_fmt =
+ isl_lower_storage_image_format(devinfo, image_fmt);
+
+ /* Color conversion goes before the store */
+ b->cursor = nir_before_instr(&intrin->instr);
+
+ nir_ssa_def *color = convert_color_for_store(b, devinfo,
+ intrin->src[3].ssa,
+ image_fmt, lower_fmt);
+ intrin->num_components = isl_format_get_num_channels(lower_fmt);
+ nir_instr_rewrite_src(&intrin->instr, &intrin->src[3],
+ nir_src_for_ssa(color));
+ } else {
+ const struct isl_format_layout *image_fmtl =
+ isl_format_get_layout(image_fmt);
+ /* We have a matching typed format for everything 32b and below */
+ assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
+ enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
+ ISL_FORMAT_R32G32_UINT :
+ ISL_FORMAT_R32G32B32A32_UINT;
+
+ b->cursor = nir_instr_remove(&intrin->instr);
+
+ nir_ssa_def *coord = intrin->src[1].ssa;
+
+ nir_ssa_def *do_store = image_coord_is_in_bounds(b, deref, coord);
+ if (devinfo->gen == 7 && !devinfo->is_haswell) {
+         /* Check whether the first stride component (i.e. the Bpp value)
+          * is greater than four, which on Gen7 indicates that a surface of
+          * type RAW has been bound for untyped access. Reading from or
+          * writing to a surface of a type other than RAW using untyped
+          * surface messages causes a hang on IVB and VLV.
+          */
+ nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
+ nir_ssa_def *is_raw =
+ nir_ilt(b, nir_imm_int(b, 4), nir_channel(b, stride, 0));
+ do_store = nir_iand(b, do_store, is_raw);
+ }
+ nir_push_if(b, do_store);
+
+ nir_ssa_def *addr = image_address(b, devinfo, deref, coord);
+ nir_ssa_def *color = convert_color_for_store(b, devinfo,
+ intrin->src[3].ssa,
+ image_fmt, raw_fmt);
+
+ nir_intrinsic_instr *store =
+ nir_intrinsic_instr_create(b->shader,
+ nir_intrinsic_image_deref_store_raw_intel);
+ store->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+ store->src[1] = nir_src_for_ssa(addr);
+ store->src[2] = nir_src_for_ssa(color);
+ store->num_components = image_fmtl->bpb / 32;
+ nir_builder_instr_insert(b, &store->instr);
+
+ nir_pop_if(b, NULL);
+ }
+
+ return true;
+}
+
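+/* Wraps image atomics in a check that the image is actually bound. Only
+ * needed on Gen7 parts other than Haswell (IVB and VLV); later hardware is
+ * left alone.
+ */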
+static bool
+lower_image_atomic_instr(nir_builder *b,
+ const struct gen_device_info *devinfo,
+ nir_intrinsic_instr *intrin)
+{
+ if (devinfo->is_haswell || devinfo->gen >= 8)
+ return false;
+
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+
+ b->cursor = nir_instr_remove(&intrin->instr);
+
+   /* Use an undef to hold the uses of the atomic result while we wrap the
+    * instruction in the bound-surface check below.
+    */
+ nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32);
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder));
+
+ /* Check the first component of the size field to find out if the
+ * image is bound. Necessary on IVB for typed atomics because
+ * they don't seem to respect null surfaces and will happily
+ * corrupt or read random memory when no image is bound.
+ */
+ nir_ssa_def *size = load_image_param(b, deref, SIZE);
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+ nir_push_if(b, nir_ine(b, nir_channel(b, size, 0), zero));
+
+ nir_builder_instr_insert(b, &intrin->instr);
+
+ nir_pop_if(b, NULL);
+
+ nir_ssa_def *result = nir_if_phi(b, &intrin->dest.ssa, zero);
+ nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(result));
+
+ return true;
+}
+
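+/* Lowers an image size query to a read of the SIZE field of the image param
+ * struct, fixing up the array length for 1D arrays and dividing out the six
+ * faces for cube images.
+ */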
+static bool
+lower_image_size_instr(nir_builder *b,
+ const struct gen_device_info *devinfo,
+ nir_intrinsic_instr *intrin)
+{
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+
+ b->cursor = nir_instr_remove(&intrin->instr);
+
+ nir_ssa_def *size = load_image_param(b, deref, SIZE);
+
+ nir_ssa_def *comps[4] = { NULL, NULL, NULL, NULL };
+
+ enum glsl_sampler_dim dim = glsl_get_sampler_dim(deref->type);
+ unsigned coord_comps = glsl_get_sampler_coordinate_components(deref->type);
+ for (unsigned c = 0; c < coord_comps; c++) {
+ if (c == 1 && dim == GLSL_SAMPLER_DIM_1D) {
+ /* The array length for 1D arrays is in .z */
+ comps[1] = nir_channel(b, size, 2);
+ } else if (c == 2 && dim == GLSL_SAMPLER_DIM_CUBE) {
+ comps[2] = nir_idiv(b, nir_channel(b, size, 2), nir_imm_int(b, 6));
+ } else {
+ comps[c] = nir_channel(b, size, c);
+ }
+ }
+
+ for (unsigned c = coord_comps; c < intrin->dest.ssa.num_components; ++c)
+ comps[c] = nir_imm_int(b, 1);
+
+ nir_ssa_def *vec = nir_vec(b, comps, intrin->dest.ssa.num_components);
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(vec));
+
+ return true;
+}
+
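+/* Top-level pass: walks every block of every function and lowers image
+ * load, store, atomic and size intrinsics using the helpers above.
+ */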
+bool
+brw_nir_lower_image_load_store(nir_shader *shader,
+ const struct gen_device_info *devinfo)
+{
+ bool progress = false;
+
+ nir_foreach_function(function, shader) {
+ if (function->impl == NULL)
+ continue;
+
+ nir_foreach_block_safe(block, function->impl) {
+ nir_builder b;
+ nir_builder_init(&b, function->impl);
+
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_image_deref_load:
+ if (lower_image_load_instr(&b, devinfo, intrin))
+ progress = true;
+ break;
+
+ case nir_intrinsic_image_deref_store:
+ if (lower_image_store_instr(&b, devinfo, intrin))
+ progress = true;
+ break;
+
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_min:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ if (lower_image_atomic_instr(&b, devinfo, intrin))
+ progress = true;
+ break;
+
+ case nir_intrinsic_image_deref_size:
+ if (lower_image_size_instr(&b, devinfo, intrin))
+ progress = true;
+ break;
+
+ default:
+ /* Nothing to do */
+ break;
+ }
+ }
+ }
+
+ nir_metadata_preserve(function->impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+
+ return progress;
+}
'brw_nir_analyze_ubo_ranges.c',
'brw_nir_attribute_workarounds.c',
'brw_nir_lower_cs_intrinsics.c',
+ 'brw_nir_lower_image_load_store.c',
'brw_nir_opt_peephole_ffma.c',
'brw_nir_tcs_workarounds.c',
'brw_packed_float.c',
if (nir->info.stage != MESA_SHADER_COMPUTE)
brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+ NIR_PASS_V(nir, brw_nir_lower_image_load_store, compiler->devinfo);
+
assert(nir->num_uniforms == prog_data->nr_params * 4);
stage->nir = nir;
nir = brw_preprocess_nir(brw->screen->compiler, nir);
+ NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);
+
if (stage == MESA_SHADER_TESS_CTRL) {
/* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
static const gl_state_index16 tokens[STATE_LENGTH] =