/*
 * Copyright © 2013-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "brw_fs_surface_builder.h"
#include "brw_fs.h"

using namespace brw;

namespace brw {
   namespace surface_access {
      /**
       * Generate a logical send opcode for a surface message and return
       * the result.
       */
      fs_reg
      emit_send(const fs_builder &bld, enum opcode opcode,
                const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
                unsigned dims, unsigned arg, unsigned rsize,
                brw_predicate pred = BRW_PREDICATE_NONE)
      {
         /* Reduce the dynamically uniform surface index to a single
          * scalar.
          */
         const fs_reg usurface = bld.emit_uniformize(surface);
         const fs_reg srcs[] = {
            addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
         };
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
         fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));

         inst->size_written = rsize * dst.component_size(inst->exec_size);
         inst->predicate = pred;
         return dst;
      }
      /**
       * Emit an untyped surface read opcode.  \p dims determines the number
       * of components of the address and \p size the number of components of
       * the returned value.
       */
      fs_reg
      emit_untyped_read(const fs_builder &bld,
                        const fs_reg &surface, const fs_reg &addr,
                        unsigned dims, unsigned size,
                        brw_predicate pred)
      {
         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size, pred);
      }
      /**
       * Emit an untyped surface write opcode.  \p dims determines the number
       * of components of the address and \p size the number of components of
       * the argument.
       */
      void
      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
                         const fs_reg &addr, const fs_reg &src,
                         unsigned dims, unsigned size,
                         brw_predicate pred)
      {
         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0, pred);
      }
      /**
       * Emit an untyped surface atomic opcode.  \p dims determines the number
       * of components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_untyped_atomic(const fs_builder &bld,
                          const fs_reg &surface, const fs_reg &addr,
                          const fs_reg &src0, const fs_reg &src1,
                          unsigned dims, unsigned rsize, unsigned op,
                          brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }
      /**
       * Emit a typed surface read opcode.  \p dims determines the number of
       * components of the address and \p size the number of components of the
       * returned value.
       */
      fs_reg
      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
                      const fs_reg &addr, unsigned dims, unsigned size)
      {
         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size);
      }
      /**
       * Emit a typed surface write opcode.  \p dims determines the number of
       * components of the address and \p size the number of components of the
       * argument.
       */
      void
      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned dims, unsigned size)
      {
         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0);
      }
      /**
       * Emit a typed surface atomic opcode.  \p dims determines the number of
       * components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
                        const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned dims, unsigned rsize, unsigned op,
                        brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }
   }
   namespace image_format_info {
      /* The higher compiler layers use the GL enums for image formats even if
       * they come in from SPIR-V or Vulkan.  We need to turn them into an ISL
       * enum before we can use them.
       */
      isl_format
      isl_format_for_gl_format(uint32_t gl_format)
      {
         switch (gl_format) {
         case GL_R8:             return ISL_FORMAT_R8_UNORM;
         case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
         case GL_R8UI:           return ISL_FORMAT_R8_UINT;
         case GL_R8I:            return ISL_FORMAT_R8_SINT;
         case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
         case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
         case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
         case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
         case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
         case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
         case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
         case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
         case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
         case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
         case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
         case GL_R16:            return ISL_FORMAT_R16_UNORM;
         case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
         case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
         case GL_R16UI:          return ISL_FORMAT_R16_UINT;
         case GL_R16I:           return ISL_FORMAT_R16_SINT;
         case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
         case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
         case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
         case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
         case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
         case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
         case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
         case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
         case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
         case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
         case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
         case GL_R32UI:          return ISL_FORMAT_R32_UINT;
         case GL_R32I:           return ISL_FORMAT_R32_SINT;
         case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
         case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
         case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
         case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
         case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
         case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
         case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
         default:
            assert(!"Invalid image format");
            return ISL_FORMAT_UNSUPPORTED;
         }
      }
      /**
       * Simple 4-tuple of scalars used to pass around per-color component
       * values.
       */
      struct color_u {
         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
         {
         }

         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
            r(r), g(g), b(b), a(a)
         {
         }

         unsigned
         operator[](unsigned i) const
         {
            const unsigned xs[] = { r, g, b, a };
            return xs[i];
         }

         unsigned r, g, b, a;
      };
      /**
       * Return the per-channel bitfield widths for a given image format.
       */
      color_u
      get_bit_widths(isl_format format)
      {
         const isl_format_layout *fmtl = isl_format_get_layout(format);

         return color_u(fmtl->channels.r.bits,
                        fmtl->channels.g.bits,
                        fmtl->channels.b.bits,
                        fmtl->channels.a.bits);
      }
      /**
       * Return the per-channel bitfield shifts for a given image format.
       */
      color_u
      get_bit_shifts(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return color_u(0, widths.r, widths.r + widths.g,
                        widths.r + widths.g + widths.b);
      }
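
      /* For example (illustrative): an RGB10_A2-style layout has widths
       * (10, 10, 10, 2) and therefore shifts (0, 10, 20, 30), i.e. red
       * starts at bit 0, green at bit 10, blue at bit 20 and alpha at bit 30
       * of the packed 32-bit word.
       */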
      /**
       * Return true if all present components have the same bit width.
       */
      bool
      is_homogeneous(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return ((widths.g == 0 || widths.g == widths.r) &&
                 (widths.b == 0 || widths.b == widths.r) &&
                 (widths.a == 0 || widths.a == widths.r));
      }
      /**
       * Return true if the format conversion boils down to a trivial copy.
       */
      bool
      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
      {
         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
                format == isl_lower_storage_image_format(devinfo, format);
      }
      /**
       * Return true if the hardware natively supports some format with
       * compatible bitfield layout, but possibly different data types.
       */
      bool
      has_supported_bit_layout(const gen_device_info *devinfo,
                               isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         const color_u lower_widths = get_bit_widths(
            isl_lower_storage_image_format(devinfo, format));

         return (widths.r == lower_widths.r &&
                 widths.g == lower_widths.g &&
                 widths.b == lower_widths.b &&
                 widths.a == lower_widths.a);
      }
      /**
       * Return true if we are required to spread individual components over
       * several components of the format used by the hardware (RG32 and
       * friends implemented as RGBA16UI).
       */
      bool
      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (isl_format_get_num_channels(format) <
                 isl_format_get_num_channels(lower_format));
      }
      /**
       * Return true if the hardware returns garbage in the unused high bits
       * of each component.  This may happen on IVB because we rely on the
       * undocumented behavior that typed reads from surfaces of the
       * unsupported R8 and R16 formats return useful data in their least
       * significant bits.
       */
      bool
      has_undefined_high_bits(const gen_device_info *devinfo,
                              isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (devinfo->gen == 7 && !devinfo->is_haswell &&
                 (lower_format == ISL_FORMAT_R16_UINT ||
                  lower_format == ISL_FORMAT_R8_UINT));
      }
      /**
       * Return true if the format represents values as signed integers
       * requiring sign extension when unpacking.
       */
      bool
      needs_sign_extension(isl_format format)
      {
         return isl_format_has_snorm_channel(format) ||
                isl_format_has_sint_channel(format);
      }
   }
   namespace image_validity {
      /**
       * Check whether the bound image is suitable for untyped access.
       */
      brw_predicate
      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
                               brw_predicate pred)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check whether the first stride component (i.e. the Bpp value)
             * is greater than four, which on Gen7 indicates that a surface of
             * type RAW has been bound for untyped access.  Reading or writing
             * to a surface of type other than RAW using untyped surface
             * messages causes a hang on IVB and VLV.
             */
            set_predicate(pred,
                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
                                  BRW_CONDITIONAL_G));

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent generations handle the format mismatch
             * gracefully.
             */
            return pred;
         }
      }
      /**
       * Check whether there is an image bound at the given index and write
       * the comparison result to f0.0.  Returns an appropriate predication
       * mode to use on subsequent image operations.
       */
      brw_predicate
      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check the first component of the size field to find out if the
             * image is bound.  Necessary on IVB for typed atomics because
             * they don't seem to respect null surfaces and will happily
             * corrupt or read random memory when no image is bound.
             */
            bld.CMP(bld.null_reg_ud(),
                    retype(size, BRW_REGISTER_TYPE_UD),
                    brw_imm_d(0), BRW_CONDITIONAL_NZ);

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent platforms implement compliant behavior when a null
             * surface is bound.
             */
            return BRW_PREDICATE_NONE;
         }
      }
      /**
       * Check whether the provided coordinates are within the image bounds
       * and write the comparison result to f0.0.  Returns an appropriate
       * predication mode to use on subsequent image operations.
       */
      brw_predicate
      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
                        const fs_reg &addr, unsigned dims)
      {
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         for (unsigned c = 0; c < dims; ++c)
            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
                          bld.CMP(bld.null_reg_ud(),
                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
                                  offset(size, bld, c),
                                  BRW_CONDITIONAL_L));

         return BRW_PREDICATE_NORMAL;
      }
   }
   namespace image_coordinates {
      /**
       * Return the total number of coordinates needed to address a texel of
       * the surface, which may be more than the sum of \p surf_dims and \p
       * arr_dims if padding is required.
       */
      unsigned
      num_image_coordinates(const fs_builder &bld,
                            unsigned surf_dims, unsigned arr_dims,
                            isl_format format)
      {
         /* HSW in vec4 mode and our software coordinate handling for untyped
          * reads want the array index to be at the Z component.
          */
         const bool array_index_at_z =
            format != ISL_FORMAT_UNSUPPORTED &&
            !isl_has_matching_typed_storage_image_format(
               bld.shader->devinfo, format);
         const unsigned zero_dims =
            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);

         return surf_dims + zero_dims + arr_dims;
      }
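
      /* For instance (illustrative): a 1D array image going through the
       * untyped path has surf_dims == 1 and arr_dims == 1, and the array
       * index has to end up in the Z component, so a zero Y coordinate is
       * inserted and 1 + 1 + 1 = 3 is returned.  A 2D array image
       * (surf_dims == 2, arr_dims == 1) needs no padding and simply yields 3.
       */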
      /**
       * Transform image coordinates into the form expected by the
       * image access instructions.
       */
      fs_reg
      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
                             unsigned surf_dims, unsigned arr_dims,
                             isl_format format)
      {
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (dims > surf_dims + arr_dims) {
            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
            /* The array index is required to be passed in as the Z component,
             * insert a zero at the Y component to shift it to the right
             * position.
             *
             * FINISHME: Factor out this frequently recurring pattern into a
             * helper function.
             */
            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
            const fs_reg dst = bld.vgrf(addr.type, dims);
            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
            return dst;
         } else {
            return addr;
         }
      }
      /**
       * Calculate the offset in memory of the texel given by \p coord.
       *
       * This is meant to be used with untyped surface messages to access a
       * tiled surface, which involves taking into account the tiling and
       * swizzling modes of the surface manually, so it will hopefully not
       * happen very often.
       *
       * The tiling algorithm implemented here matches either the X or Y
       * tiling layouts supported by the hardware depending on the tiling
       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
       * explanation of the hardware tiling format.
       */
      fs_reg
      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
                               const fs_reg &coord, unsigned dims)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);

         /* Shift the coordinates by the fixed surface offset.  It may be
          * non-zero if the image is a single slice of a higher-dimensional
          * surface, or if a non-zero mipmap level of the surface is bound to
          * the pipeline.  The offset needs to be applied here rather than at
          * surface state set-up time because the desired slice-level may
          * start mid-tile, so simply shifting the surface base address
          * wouldn't give a well-formed tiled surface in the general case.
          */
         for (unsigned c = 0; c < 2; ++c)
            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
                    (c < dims ?
                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
                     fs_reg(brw_imm_d(0))));

         /* The layout of 3-D textures in memory is sort-of like a tiling
          * format.  At each miplevel, the slices are arranged in rows of
          * 2^level slices per row.  The slice row is stored in tmp.y and
          * the slice within the row is stored in tmp.x.
          *
          * The layout of 2-D array textures and cubemaps is much simpler:
          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
          * stored in memory as an array of slices, each one being a 2-D
          * arrangement of miplevels, or as a 2D arrangement of miplevels,
          * each one being an array of slices.  In either case the separation
          * between slices of the same LOD is equal to the qpitch value
          * provided as stride.w.
          *
          * This code can be made to handle both 2-D array textures and 3-D
          * textures by passing in the miplevel as tile.z for 3-D textures
          * and 0 in tile.z for 2-D array textures.
          *
          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
          * of the hardware 3D texture and 2D array layouts.
          */
         if (dims > 2) {
            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
             * index.
             */
            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
            bld.SHR(offset(tmp, bld, 1),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
                    offset(tile, bld, 2));

            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
             * slice offset.
             */
            for (unsigned c = 0; c < 2; ++c) {
               bld.MUL(offset(tmp, bld, c),
                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
               bld.ADD(offset(addr, bld, c),
                       offset(addr, bld, c), offset(tmp, bld, c));
            }
         }

         if (dims > 1) {
            /* Calculate the major/minor x and y indices.  In order to
             * accommodate both X and Y tiling, the Y-major tiling format is
             * treated as being a bunch of narrow X-tiles placed next to each
             * other.  This means that the tile width for Y-tiling is actually
             * the width of one sub-column of the Y-major tile where each 4K
             * tile has 8 512B sub-columns.
             *
             * The major Y value is the row of tiles in which the pixel lives.
             * The major X value is the tile sub-column in which the pixel
             * lives; for X tiling, this is the same as the tile column, for Y
             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
             * are the position within the sub-column.
             */
            for (unsigned c = 0; c < 2; ++c) {
               /* Calculate the minor x and y indices. */
               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
                       brw_imm_d(0), offset(addr, bld, c));

               /* Calculate the major x and y indices. */
               bld.SHR(offset(major, bld, c),
                       offset(addr, bld, c), offset(tile, bld, c));
            }

            /* Calculate the texel index from the start of the tile row and
             * the vertical coordinate of the row.
             *
             *   tmp.x = (major.x << tile.y << tile.x) +
             *           (minor.y << tile.x) + minor.x
             *   tmp.y = major.y << tile.y
             */
            bld.SHL(tmp, major, offset(tile, bld, 1));
            bld.ADD(tmp, tmp, offset(minor, bld, 1));
            bld.SHL(tmp, tmp, offset(tile, bld, 0));
            bld.ADD(tmp, tmp, minor);
            bld.SHL(offset(tmp, bld, 1),
                    offset(major, bld, 1), offset(tile, bld, 1));

            /* Add it to the start of the tile row. */
            bld.MUL(offset(tmp, bld, 1),
                    offset(tmp, bld, 1), offset(stride, bld, 1));
            bld.ADD(tmp, tmp, offset(tmp, bld, 1));

            /* Multiply by the Bpp value. */
            bld.MUL(dst, tmp, stride);

            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
               /* Take into account the two dynamically specified shifts.
                * Both are needed to implement swizzling of X-tiled
                * surfaces.  For Y-tiled surfaces only one bit needs to be
                * XOR-ed with bit 6 of the memory address, so a swz value of
                * 0xff (actually interpreted as 31 by the hardware) will be
                * provided to cause the relevant bit of tmp.y to be zero and
                * turn the first XOR into the identity.  For linear surfaces
                * or platforms lacking address swizzling both shifts will be
                * 0xff causing the relevant bits of both tmp.x and .y to be
                * zero, which effectively disables swizzling.
                */
               for (unsigned c = 0; c < 2; ++c)
                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));

               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
               bld.XOR(dst, dst, tmp);
            }

         } else {
            /* Multiply by the Bpp/stride value.  Note that the addr.y may be
             * non-zero even if the image is one-dimensional because a
             * vertical offset may have been applied above to select a
             * non-zero slice or level of a higher-dimensional texture.
             */
            bld.MUL(offset(addr, bld, 1),
                    offset(addr, bld, 1), offset(stride, bld, 1));
            bld.ADD(addr, addr, offset(addr, bld, 1));
            bld.MUL(dst, addr, stride);
         }

         return dst;
      }
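
      /* As a rough illustration (assuming the usual Gen7 tile geometry): for
       * an X-tiled surface the tile.x/tile.y uniforms would hold
       * log2(512) = 9 and log2(8) = 3, since an X tile is 512 bytes wide and
       * 8 rows tall; for a Y-tiled surface they would hold log2(16) = 4 and
       * log2(32) = 5, treating each 16-byte-wide, 32-row sub-column of the
       * 4K tile as a narrow X tile as described above.
       */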
   }

   namespace image_format_conversion {
      using image_format_info::color_u;

      /**
       * Maximum representable value in an unsigned integer with the given
       * number of bits.
       */
      inline unsigned
      scale(unsigned n)
      {
         return (1 << n) - 1;
      }
      /**
       * Pack the vector \p src in a bitfield given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_pack(const fs_builder &bld, const fs_reg &src,
                const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         bool seen[4] = {};

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);

               /* Shift each component left to the correct bitfield position. */
               bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));

               /* Add everything up. */
               if (seen[shifts[c] / 32]) {
                  bld.OR(offset(dst, bld, shifts[c] / 32),
                         offset(dst, bld, shifts[c] / 32), tmp);
               } else {
                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
                  seen[shifts[c] / 32] = true;
               }
            }
         }

         return dst;
      }
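
      /* Example (illustrative): packing an RGBA8-style vector uses widths
       * (8, 8, 8, 8) and shifts (0, 8, 16, 24), so all four components are
       * shifted into place and OR-ed together into the first dword of dst.
       * An RG16-style layout has widths (16, 16, 0, 0) and shifts
       * (0, 16, 32, 32); the unused blue and alpha channels are skipped
       * because their width is zero.
       */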
      /**
       * Unpack a vector from the bitfield \p src given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_unpack(const fs_builder &bld, const fs_reg &src,
                  const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Shift left to discard the most significant bits. */
               bld.SHL(offset(dst, bld, c),
                       offset(src, bld, shifts[c] / 32),
                       brw_imm_ud(32 - shifts[c] % 32 - widths[c]));

               /* Shift back to the least significant bits using an arithmetic
                * shift to get sign extension on signed types.
                */
               bld.ASR(offset(dst, bld, c),
                       offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
            }
         }

         return dst;
      }
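
      /* Example (illustrative): unpacking the green channel of an RGBA8-style
       * word (shift 8, width 8) shifts left by 32 - 8 - 8 = 16 to discard the
       * blue and alpha bits, then arithmetic-shifts right by 32 - 8 = 24,
       * leaving the 8-bit value in the low bits, sign-extended whenever dst
       * has a signed type.
       */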
      /**
       * Convert an integer vector into another integer vector of the
       * specified bit widths, properly handling overflow.
       */
      fs_reg
      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         assert(src.type == dst.type);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp to the maximum value. */
               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
                               brw_imm_d((int)scale(widths[c] - s)),
                               BRW_CONDITIONAL_L);

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                  brw_imm_d(-(int)scale(widths[c] - s) - 1),
                                  BRW_CONDITIONAL_GE);

               /* Mask off all but the bits we actually want.  Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }
      /**
       * Convert a normalized fixed-point vector of the specified signedness
       * and bit widths into a floating point vector.
       */
      fs_reg
      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
                               const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Convert to float. */
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Divide by the normalization constants. */
               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
                       brw_imm_f(1.0f / scale(widths[c] - s)));

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c),
                                  offset(dst, bld, c), brw_imm_f(-1.0f),
                                  BRW_CONDITIONAL_GE);
            }
         }

         return dst;
      }
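
      /* Example (illustrative): for an SNORM8 channel, widths[c] == 8 and
       * s == 1, so the raw value is multiplied by 1.0f / scale(7) = 1/127;
       * the stored value -127 becomes -1.0f, and the final clamp keeps the
       * redundant encoding -128 from producing anything below -1.0f.
       */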
      /**
       * Convert a floating-point vector into a normalized fixed-point vector
       * of the specified signedness and bit widths.
       */
      fs_reg
      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
                             const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp the normalized floating-point argument. */
               if (is_signed) {
                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
                                  brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);

                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(1.0f), BRW_CONDITIONAL_L);
               } else {
                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
                                             offset(src, bld, c)));
               }

               /* Multiply by the normalization constants. */
               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
                       brw_imm_f((float)scale(widths[c] - s)));

               /* Convert to integer. */
               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));

               /* Mask off all but the bits we actually want.  Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }
      /**
       * Convert a floating point vector of the specified bit widths into a
       * 32-bit floating point vector.
       */
      fs_reg
      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
                * This works because they have a 5-bit exponent just like the
                * 16-bit floating point format, and they have no sign bit.
                */
               if (widths[c] < 16)
                  bld.SHL(offset(dst, bld, c),
                          offset(dst, bld, c), brw_imm_ud(15 - widths[c]));

               /* Convert to 32-bit floating point. */
               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
            }
         }

         return fdst;
      }
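
      /* Example (illustrative): an 11-bit float channel (5-bit exponent,
       * 6-bit mantissa, no sign bit) is shifted left by 15 - 11 = 4 bits,
       * which lines its exponent and mantissa up with the half-float layout
       * so that F16TO32 can finish the conversion; a 16-bit channel is
       * already in half-float form and needs no shift.
       */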
      /**
       * Convert a vector into a floating point vector of the specified bit
       * widths.
       */
      fs_reg
      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
                            const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));

               /* Clamp to the minimum value. */
               if (widths[c] < 16)
                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(0.0f), BRW_CONDITIONAL_GE);

               /* Convert to 16-bit floating-point. */
               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));

               /* Discard the least significant bits to get floating point
                * numbers of the requested width.  This works because the
                * 10-bit and 11-bit floating point formats have a 5-bit
                * exponent just like the 16-bit format, and they have no sign
                * bit.
                */
               if (widths[c] < 16)
                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_ud(15 - widths[c]));
            }
         }

         return dst;
      }
      /**
       * Fill missing components of a vector with 0, 0, 0, 1.
       */
      fs_reg
      emit_pad(const fs_builder &bld, const fs_reg &src,
               const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);
         const unsigned pad[] = { 0, 0, 0, 1 };

         for (unsigned c = 0; c < 4; ++c)
            bld.MOV(offset(dst, bld, c),
                    widths[c] ? offset(src, bld, c)
                              : fs_reg(brw_imm_ud(pad[c])));

         return dst;
      }
   }
   namespace image_access {
      /**
       * Load a vector from a surface of the given format and dimensionality
       * at the given coordinates.  \p surf_dims and \p arr_dims give the
       * number of non-array and array coordinates of the image respectively.
       */
      fs_reg
      emit_image_load(const fs_builder &bld,
                      const fs_reg &image, const fs_reg &addr,
                      unsigned surf_dims, unsigned arr_dims,
                      unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const gen_device_info *devinfo = bld.shader->devinfo;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);
         fs_reg tmp;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
            /* Hopefully we get here most of the time... */
            tmp = emit_typed_read(bld, image, saddr, dims,
                                  isl_format_get_num_channels(lower_format));
         } else {
            /* Untyped surface reads return 32 bits of the surface per
             * component, without any sort of unpacking or type conversion,
             */
            const unsigned size = isl_format_get_layout(format)->bpb / 32;
            /* they don't properly handle out of bounds access, so we have to
             * check manually if the coordinates are valid and predicate the
             * surface read on the result,
             */
            const brw_predicate pred =
               emit_untyped_image_check(bld, image,
                                        emit_bounds_check(bld, image,
                                                          saddr, dims));

            /* and they don't know about surface coordinates, we need to
             * convert them to a raw memory offset.
             */
            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);

            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);

            /* An out of bounds surface access should give zero as result. */
            for (unsigned c = 0; c < size; ++c)
               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
                                           offset(tmp, bld, c), brw_imm_d(0)));
         }

         /* Set the register type to D instead of UD if the data type is
          * represented as a signed integer in memory so that sign extension
          * is handled correctly by unpack.
          */
         if (needs_sign_extension(format))
            tmp = retype(tmp, BRW_REGISTER_TYPE_D);

         if (!has_supported_bit_layout(devinfo, format)) {
            /* Unpack individual vector components from the bitfield if the
             * hardware is unable to do it for us.
             */
            if (has_split_bit_layout(devinfo, format))
               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
                               get_bit_widths(lower_format));
            else
               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
                                 get_bit_widths(format));

         } else if ((needs_sign_extension(format) &&
                     !is_conversion_trivial(devinfo, format)) ||
                    has_undefined_high_bits(devinfo, format)) {
            /* Perform a trivial unpack even though the bit layout matches in
             * order to get the most significant bits of each component
             * initialized properly.
             */
            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
                              get_bit_widths(format));
         }

         if (!isl_format_has_int_channel(format)) {
            if (is_conversion_trivial(devinfo, format)) {
               /* Just need to cast the vector to the target type. */
               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
            } else {
               /* Do the right sort of type conversion to float. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_from_float(
                     bld, tmp, get_bit_widths(format));
               else
                  tmp = emit_convert_from_scaled(
                     bld, tmp, get_bit_widths(format),
                     isl_format_has_snorm_channel(format));
            }
         }

         /* Initialize missing components of the result. */
         return emit_pad(bld, tmp, get_bit_widths(format));
      }
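
      /* As a rough illustration: a GLSL imageLoad() from a 2D non-array
       * rgba8 image reaches this helper with surf_dims == 2, arr_dims == 0
       * and gl_format == GL_RGBA8.  It takes the typed-read path whenever
       * isl reports a matching typed storage format for the lowered format,
       * and otherwise falls back to the predicated untyped read followed by
       * manual unpacking and conversion.
       */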
      /**
       * Store a vector in a surface of the given format and dimensionality at
       * the given coordinates.  \p surf_dims and \p arr_dims give the number
       * of non-array and array coordinates of the image respectively.
       */
      void
      emit_image_store(const fs_builder &bld, const fs_reg &image,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned surf_dims, unsigned arr_dims,
                       unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const gen_device_info *devinfo = bld.shader->devinfo;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (gl_format == GL_NONE) {
            /* We don't know what the format is, but that's fine because it
             * implies write-only access, and typed surface writes are always
             * able to take care of type conversion and packing for us.
             */
            emit_typed_write(bld, image, saddr, src, dims, 4);
         } else {
            const isl_format lower_format =
               isl_lower_storage_image_format(devinfo, format);
            fs_reg tmp = src;

            if (!is_conversion_trivial(devinfo, format)) {
               /* Do the right sort of type conversion. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));

               else if (isl_format_has_int_channel(format))
                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
                                                isl_format_has_sint_channel(format));

               else
                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
                                               isl_format_has_snorm_channel(format));
            }

            /* We're down to bit manipulation at this point. */
            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);

            if (!has_supported_bit_layout(devinfo, format)) {
               /* Pack the vector components into a bitfield if the hardware
                * is unable to do it for us.
                */
               if (has_split_bit_layout(devinfo, format))
                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
                                    get_bit_widths(lower_format));

               else
                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
                                  get_bit_widths(format));
            }

            if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
               /* Hopefully we get here most of the time... */
               emit_typed_write(bld, image, saddr, tmp, dims,
                                isl_format_get_num_channels(lower_format));
            } else {
               /* Untyped surface writes store 32 bits of the surface per
                * component, without any sort of packing or type conversion,
                */
               const unsigned size = isl_format_get_layout(format)->bpb / 32;

               /* they don't properly handle out of bounds access, so we have
                * to check manually if the coordinates are valid and predicate
                * the surface write on the result,
                */
               const brw_predicate pred =
                  emit_untyped_image_check(bld, image,
                                           emit_bounds_check(bld, image,
                                                             saddr, dims));

               /* and, phew, they don't know about surface coordinates, we
                * need to convert them to a raw memory offset.
                */
               const fs_reg laddr = emit_address_calculation(
                  bld, image, saddr, dims);

               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
            }
         }
      }
      /**
       * Perform an atomic read-modify-write operation in a surface of the
       * given dimensionality at the given coordinates.  \p surf_dims and \p
       * arr_dims give the number of non-array and array coordinates of the
       * image respectively.  Main building block of the imageAtomic GLSL
       * built-ins.
       */
      fs_reg
      emit_image_atomic(const fs_builder &bld,
                        const fs_reg &image, const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned surf_dims, unsigned arr_dims,
                        unsigned rsize, unsigned op)
      {
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         /* Avoid performing an atomic operation on an unbound surface. */
         const brw_predicate pred = emit_typed_atomic_check(bld, image);

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
                                   ISL_FORMAT_R32_UINT);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims,
                                  ISL_FORMAT_R32_UINT);

         /* Thankfully we can do without untyped atomics here. */
         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
                                              dims, rsize, op, pred);

         /* An unbound surface access should give zero as result. */
         if (rsize)
            set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));

         return retype(tmp, src0.type);
      }
   }
}