2 * Copyright © 2013-2015 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 #include "brw_fs_surface_builder.h"
30 namespace surface_access
{
33 * Generate a logical send opcode for a surface message and return
37 emit_send(const fs_builder
&bld
, enum opcode opcode
,
38 const fs_reg
&addr
, const fs_reg
&src
, const fs_reg
&surface
,
39 unsigned dims
, unsigned arg
, unsigned rsize
,
40 brw_predicate pred
= BRW_PREDICATE_NONE
)
42 /* Reduce the dynamically uniform surface index to a single
45 const fs_reg usurface
= bld
.emit_uniformize(surface
);
46 const fs_reg srcs
[] = {
47 addr
, src
, usurface
, fs_reg(dims
), fs_reg(arg
)
49 const fs_reg dst
= bld
.vgrf(BRW_REGISTER_TYPE_UD
, rsize
);
50 fs_inst
*inst
= bld
.emit(opcode
, dst
, srcs
, ARRAY_SIZE(srcs
));
52 inst
->regs_written
= rsize
* bld
.dispatch_width() / 8;
53 inst
->predicate
= pred
;
59 * Emit an untyped surface read opcode. \p dims determines the number
60 * of components of the address and \p size the number of components of
64 emit_untyped_read(const fs_builder
&bld
,
65 const fs_reg
&surface
, const fs_reg
&addr
,
66 unsigned dims
, unsigned size
,
69 return emit_send(bld
, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL
,
70 addr
, fs_reg(), surface
, dims
, size
, size
, pred
);
74 * Emit an untyped surface write opcode. \p dims determines the number
75 * of components of the address and \p size the number of components of
79 emit_untyped_write(const fs_builder
&bld
, const fs_reg
&surface
,
80 const fs_reg
&addr
, const fs_reg
&src
,
81 unsigned dims
, unsigned size
,
84 emit_send(bld
, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL
,
85 addr
, src
, surface
, dims
, size
, 0, pred
);
89 * Emit an untyped surface atomic opcode. \p dims determines the number
90 * of components of the address and \p rsize the number of components of
91 * the returned value (either zero or one).
94 emit_untyped_atomic(const fs_builder
&bld
,
95 const fs_reg
&surface
, const fs_reg
&addr
,
96 const fs_reg
&src0
, const fs_reg
&src1
,
97 unsigned dims
, unsigned rsize
, unsigned op
,
100 /* FINISHME: Factor out this frequently recurring pattern into a
103 const unsigned n
= (src0
.file
!= BAD_FILE
) + (src1
.file
!= BAD_FILE
);
104 const fs_reg srcs
[] = { src0
, src1
};
105 const fs_reg tmp
= bld
.vgrf(BRW_REGISTER_TYPE_UD
, n
);
106 bld
.LOAD_PAYLOAD(tmp
, srcs
, n
, 0);
108 return emit_send(bld
, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL
,
109 addr
, tmp
, surface
, dims
, op
, rsize
, pred
);
113 * Emit a typed surface read opcode. \p dims determines the number of
114 * components of the address and \p size the number of components of the
118 emit_typed_read(const fs_builder
&bld
, const fs_reg
&surface
,
119 const fs_reg
&addr
, unsigned dims
, unsigned size
)
121 return emit_send(bld
, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL
,
122 addr
, fs_reg(), surface
, dims
, size
, size
);
126 * Emit a typed surface write opcode. \p dims determines the number of
127 * components of the address and \p size the number of components of the
131 emit_typed_write(const fs_builder
&bld
, const fs_reg
&surface
,
132 const fs_reg
&addr
, const fs_reg
&src
,
133 unsigned dims
, unsigned size
)
135 emit_send(bld
, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL
,
136 addr
, src
, surface
, dims
, size
, 0);
140 * Emit a typed surface atomic opcode. \p dims determines the number of
141 * components of the address and \p rsize the number of components of
142 * the returned value (either zero or one).
145 emit_typed_atomic(const fs_builder
&bld
, const fs_reg
&surface
,
147 const fs_reg
&src0
, const fs_reg
&src1
,
148 unsigned dims
, unsigned rsize
, unsigned op
,
151 /* FINISHME: Factor out this frequently recurring pattern into a
154 const unsigned n
= (src0
.file
!= BAD_FILE
) + (src1
.file
!= BAD_FILE
);
155 const fs_reg srcs
[] = { src0
, src1
};
156 const fs_reg tmp
= bld
.vgrf(BRW_REGISTER_TYPE_UD
, n
);
157 bld
.LOAD_PAYLOAD(tmp
, srcs
, n
, 0);
159 return emit_send(bld
, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL
,
160 addr
, tmp
, surface
, dims
, op
, rsize
);
166 namespace image_format_info
{
/**
 * Simple 4-tuple of scalars used to pass around per-color component
 * values (e.g. bitfield widths or shifts of an image format).
 */
struct color_u {
   /* Broadcast a single value (default zero) to all four components. */
   color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
   {
   }

   /* Construct from explicit per-component values. */
   color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
      r(r), g(g), b(b), a(a)
   {
   }

   /* Component access by index, 0..3 -> r, g, b, a. */
   unsigned
   operator[](unsigned i) const
   {
      const unsigned xs[] = { r, g, b, a };
      return xs[i];
   }

   unsigned r, g, b, a;
};
192 * Return the per-channel bitfield widths for a given image format.
195 get_bit_widths(mesa_format format
)
197 return color_u(_mesa_get_format_bits(format
, GL_RED_BITS
),
198 _mesa_get_format_bits(format
, GL_GREEN_BITS
),
199 _mesa_get_format_bits(format
, GL_BLUE_BITS
),
200 _mesa_get_format_bits(format
, GL_ALPHA_BITS
));
204 * Return the per-channel bitfield shifts for a given image format.
207 get_bit_shifts(mesa_format format
)
209 const color_u widths
= get_bit_widths(format
);
210 return color_u(0, widths
.r
, widths
.r
+ widths
.g
,
211 widths
.r
+ widths
.g
+ widths
.b
);
215 * Return true if all present components have the same bit width.
218 is_homogeneous(mesa_format format
)
220 const color_u widths
= get_bit_widths(format
);
221 return ((widths
.g
== 0 || widths
.g
== widths
.r
) &&
222 (widths
.b
== 0 || widths
.b
== widths
.r
) &&
223 (widths
.a
== 0 || widths
.a
== widths
.r
));
227 * Return true if the format conversion boils down to a trivial copy.
230 is_conversion_trivial(const brw_device_info
*devinfo
, mesa_format format
)
232 return (get_bit_widths(format
).r
== 32 && is_homogeneous(format
)) ||
233 format
== brw_lower_mesa_image_format(devinfo
, format
);
237 * Return true if the hardware natively supports some format with
238 * compatible bitfield layout, but possibly different data types.
241 has_supported_bit_layout(const brw_device_info
*devinfo
,
244 const color_u widths
= get_bit_widths(format
);
245 const color_u lower_widths
= get_bit_widths(
246 brw_lower_mesa_image_format(devinfo
, format
));
248 return (widths
.r
== lower_widths
.r
&&
249 widths
.g
== lower_widths
.g
&&
250 widths
.b
== lower_widths
.b
&&
251 widths
.a
== lower_widths
.a
);
255 * Return true if we are required to spread individual components over
256 * several components of the format used by the hardware (RG32 and
257 * friends implemented as RGBA16UI).
260 has_split_bit_layout(const brw_device_info
*devinfo
, mesa_format format
)
262 const mesa_format lower_format
=
263 brw_lower_mesa_image_format(devinfo
, format
);
265 return (_mesa_format_num_components(format
) <
266 _mesa_format_num_components(lower_format
));
270 * Return true unless we have to fall back to untyped surface access.
274 has_matching_typed_format(const brw_device_info
*devinfo
,
277 return (_mesa_get_format_bytes(format
) <= 4 ||
278 (_mesa_get_format_bytes(format
) <= 8 &&
279 (devinfo
->gen
>= 8 || devinfo
->is_haswell
)) ||
284 * Return true if the hardware returns garbage in the unused high bits
285 * of each component. This may happen on IVB because we rely on the
286 * undocumented behavior that typed reads from surfaces of the
287 * unsupported R8 and R16 formats return useful data in their least
291 has_undefined_high_bits(const brw_device_info
*devinfo
,
294 const mesa_format lower_format
=
295 brw_lower_mesa_image_format(devinfo
, format
);
297 return (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
298 (lower_format
== MESA_FORMAT_R_UINT16
||
299 lower_format
== MESA_FORMAT_R_UINT8
));
303 * Return true if the format represents values as signed integers
304 * requiring sign extension when unpacking.
307 needs_sign_extension(mesa_format format
)
309 return (_mesa_get_format_datatype(format
) == GL_SIGNED_NORMALIZED
||
310 _mesa_get_format_datatype(format
) == GL_INT
);
314 namespace image_validity
{
316 * Check whether there is an image bound at the given index and write
317 * the comparison result to f0.0. Returns an appropriate predication
318 * mode to use on subsequent image operations.
321 emit_surface_check(const fs_builder
&bld
, const fs_reg
&image
)
323 const brw_device_info
*devinfo
= bld
.shader
->devinfo
;
324 const fs_reg size
= offset(image
, bld
, BRW_IMAGE_PARAM_SIZE_OFFSET
);
326 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
) {
327 /* Check the first component of the size field to find out if the
328 * image is bound. Necessary on IVB for typed atomics because
329 * they don't seem to respect null surfaces and will happily
330 * corrupt or read random memory when no image is bound.
332 bld
.CMP(bld
.null_reg_ud(),
333 retype(size
, BRW_REGISTER_TYPE_UD
),
334 fs_reg(0), BRW_CONDITIONAL_NZ
);
336 return BRW_PREDICATE_NORMAL
;
338 /* More recent platforms implement compliant behavior when a null
341 return BRW_PREDICATE_NONE
;
346 * Check whether the provided coordinates are within the image bounds
347 * and write the comparison result to f0.0. Returns an appropriate
348 * predication mode to use on subsequent image operations.
351 emit_bounds_check(const fs_builder
&bld
, const fs_reg
&image
,
352 const fs_reg
&addr
, unsigned dims
)
354 const fs_reg size
= offset(image
, bld
, BRW_IMAGE_PARAM_SIZE_OFFSET
);
356 for (unsigned c
= 0; c
< dims
; ++c
)
357 set_predicate(c
== 0 ? BRW_PREDICATE_NONE
: BRW_PREDICATE_NORMAL
,
358 bld
.CMP(bld
.null_reg_ud(),
359 offset(retype(addr
, BRW_REGISTER_TYPE_UD
), bld
, c
),
360 offset(size
, bld
, c
),
363 return BRW_PREDICATE_NORMAL
;
367 namespace image_coordinates
{
369 * Return the total number of coordinates needed to address a texel of
370 * the surface, which may be more than the sum of \p surf_dims and \p
371 * arr_dims if padding is required.
374 num_image_coordinates(const fs_builder
&bld
,
375 unsigned surf_dims
, unsigned arr_dims
,
378 /* HSW in vec4 mode and our software coordinate handling for untyped
379 * reads want the array index to be at the Z component.
381 const bool array_index_at_z
=
382 !image_format_info::has_matching_typed_format(
383 bld
.shader
->devinfo
, format
);
384 const unsigned zero_dims
=
385 ((surf_dims
== 1 && arr_dims
== 1 && array_index_at_z
) ? 1 : 0);
387 return surf_dims
+ zero_dims
+ arr_dims
;
391 * Transform image coordinates into the form expected by the
395 emit_image_coordinates(const fs_builder
&bld
, const fs_reg
&addr
,
396 unsigned surf_dims
, unsigned arr_dims
,
399 const unsigned dims
=
400 num_image_coordinates(bld
, surf_dims
, arr_dims
, format
);
402 if (dims
> surf_dims
+ arr_dims
) {
403 assert(surf_dims
== 1 && arr_dims
== 1 && dims
== 3);
404 /* The array index is required to be passed in as the Z component,
405 * insert a zero at the Y component to shift it to the right
408 * FINISHME: Factor out this frequently recurring pattern into a
411 const fs_reg srcs
[] = { addr
, fs_reg(0), offset(addr
, bld
, 1) };
412 const fs_reg dst
= bld
.vgrf(addr
.type
, dims
);
413 bld
.LOAD_PAYLOAD(dst
, srcs
, dims
, 0);
421 * Calculate the offset in memory of the texel given by \p coord.
423 * This is meant to be used with untyped surface messages to access a
424 * tiled surface, what involves taking into account the tiling and
425 * swizzling modes of the surface manually so it will hopefully not
428 * The tiling algorithm implemented here matches either the X or Y
429 * tiling layouts supported by the hardware depending on the tiling
430 * coefficients passed to the program as uniforms. See Volume 1 Part 2
431 * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
432 * explanation of the hardware tiling format.
435 emit_address_calculation(const fs_builder
&bld
, const fs_reg
&image
,
436 const fs_reg
&coord
, unsigned dims
)
438 const brw_device_info
*devinfo
= bld
.shader
->devinfo
;
439 const fs_reg off
= offset(image
, bld
, BRW_IMAGE_PARAM_OFFSET_OFFSET
);
440 const fs_reg stride
= offset(image
, bld
, BRW_IMAGE_PARAM_STRIDE_OFFSET
);
441 const fs_reg tile
= offset(image
, bld
, BRW_IMAGE_PARAM_TILING_OFFSET
);
442 const fs_reg swz
= offset(image
, bld
, BRW_IMAGE_PARAM_SWIZZLING_OFFSET
);
443 const fs_reg addr
= bld
.vgrf(BRW_REGISTER_TYPE_UD
, 2);
444 const fs_reg tmp
= bld
.vgrf(BRW_REGISTER_TYPE_UD
, 2);
445 const fs_reg minor
= bld
.vgrf(BRW_REGISTER_TYPE_UD
, 2);
446 const fs_reg major
= bld
.vgrf(BRW_REGISTER_TYPE_UD
, 2);
447 const fs_reg dst
= bld
.vgrf(BRW_REGISTER_TYPE_UD
);
449 /* Shift the coordinates by the fixed surface offset. It may be
450 * non-zero if the image is a single slice of a higher-dimensional
451 * surface, or if a non-zero mipmap level of the surface is bound to
452 * the pipeline. The offset needs to be applied here rather than at
453 * surface state set-up time because the desired slice-level may
454 * start mid-tile, so simply shifting the surface base address
455 * wouldn't give a well-formed tiled surface in the general case.
457 for (unsigned c
= 0; c
< 2; ++c
)
458 bld
.ADD(offset(addr
, bld
, c
), offset(off
, bld
, c
),
460 offset(retype(coord
, BRW_REGISTER_TYPE_UD
), bld
, c
) :
463 /* The layout of 3-D textures in memory is sort-of like a tiling
464 * format. At each miplevel, the slices are arranged in rows of
465 * 2^level slices per row. The slice row is stored in tmp.y and
466 * the slice within the row is stored in tmp.x.
468 * The layout of 2-D array textures and cubemaps is much simpler:
469 * Depending on whether the ARYSPC_LOD0 layout is in use it will be
470 * stored in memory as an array of slices, each one being a 2-D
471 * arrangement of miplevels, or as a 2D arrangement of miplevels,
472 * each one being an array of slices. In either case the separation
473 * between slices of the same LOD is equal to the qpitch value
474 * provided as stride.w.
476 * This code can be made to handle either 2D arrays and 3D textures
477 * by passing in the miplevel as tile.z for 3-D textures and 0 in
478 * tile.z for 2-D array textures.
480 * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
481 * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
482 * of the hardware 3D texture and 2D array layouts.
485 /* Decompose z into a major (tmp.y) and a minor (tmp.x)
488 bld
.BFE(offset(tmp
, bld
, 0), offset(tile
, bld
, 2), fs_reg(0),
489 offset(retype(coord
, BRW_REGISTER_TYPE_UD
), bld
, 2));
490 bld
.SHR(offset(tmp
, bld
, 1),
491 offset(retype(coord
, BRW_REGISTER_TYPE_UD
), bld
, 2),
492 offset(tile
, bld
, 2));
494 /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
497 for (unsigned c
= 0; c
< 2; ++c
) {
498 bld
.MUL(offset(tmp
, bld
, c
),
499 offset(stride
, bld
, 2 + c
), offset(tmp
, bld
, c
));
500 bld
.ADD(offset(addr
, bld
, c
),
501 offset(addr
, bld
, c
), offset(tmp
, bld
, c
));
506 /* Calculate the major/minor x and y indices. In order to
507 * accommodate both X and Y tiling, the Y-major tiling format is
508 * treated as being a bunch of narrow X-tiles placed next to each
509 * other. This means that the tile width for Y-tiling is actually
510 * the width of one sub-column of the Y-major tile where each 4K
511 * tile has 8 512B sub-columns.
513 * The major Y value is the row of tiles in which the pixel lives.
514 * The major X value is the tile sub-column in which the pixel
515 * lives; for X tiling, this is the same as the tile column, for Y
516 * tiling, each tile has 8 sub-columns. The minor X and Y indices
517 * are the position within the sub-column.
519 for (unsigned c
= 0; c
< 2; ++c
) {
520 /* Calculate the minor x and y indices. */
521 bld
.BFE(offset(minor
, bld
, c
), offset(tile
, bld
, c
),
522 fs_reg(0), offset(addr
, bld
, c
));
524 /* Calculate the major x and y indices. */
525 bld
.SHR(offset(major
, bld
, c
),
526 offset(addr
, bld
, c
), offset(tile
, bld
, c
));
529 /* Calculate the texel index from the start of the tile row and
530 * the vertical coordinate of the row.
532 * tmp.x = (major.x << tile.y << tile.x) +
533 * (minor.y << tile.x) + minor.x
534 * tmp.y = major.y << tile.y
536 bld
.SHL(tmp
, major
, offset(tile
, bld
, 1));
537 bld
.ADD(tmp
, tmp
, offset(minor
, bld
, 1));
538 bld
.SHL(tmp
, tmp
, offset(tile
, bld
, 0));
539 bld
.ADD(tmp
, tmp
, minor
);
540 bld
.SHL(offset(tmp
, bld
, 1),
541 offset(major
, bld
, 1), offset(tile
, bld
, 1));
543 /* Add it to the start of the tile row. */
544 bld
.MUL(offset(tmp
, bld
, 1),
545 offset(tmp
, bld
, 1), offset(stride
, bld
, 1));
546 bld
.ADD(tmp
, tmp
, offset(tmp
, bld
, 1));
548 /* Multiply by the Bpp value. */
549 bld
.MUL(dst
, tmp
, stride
);
551 if (devinfo
->gen
< 8 && !devinfo
->is_baytrail
) {
552 /* Take into account the two dynamically specified shifts.
553 * Both need are used to implement swizzling of X-tiled
554 * surfaces. For Y-tiled surfaces only one bit needs to be
555 * XOR-ed with bit 6 of the memory address, so a swz value of
556 * 0xff (actually interpreted as 31 by the hardware) will be
557 * provided to cause the relevant bit of tmp.y to be zero and
558 * turn the first XOR into the identity. For linear surfaces
559 * or platforms lacking address swizzling both shifts will be
560 * 0xff causing the relevant bits of both tmp.x and .y to be
561 * zero, what effectively disables swizzling.
563 for (unsigned c
= 0; c
< 2; ++c
)
564 bld
.SHR(offset(tmp
, bld
, c
), dst
, offset(swz
, bld
, c
));
566 /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
567 bld
.XOR(tmp
, tmp
, offset(tmp
, bld
, 1));
568 bld
.AND(tmp
, tmp
, fs_reg(1 << 6));
569 bld
.XOR(dst
, dst
, tmp
);
573 /* Multiply by the Bpp/stride value. Note that the addr.y may be
574 * non-zero even if the image is one-dimensional because a
575 * vertical offset may have been applied above to select a
576 * non-zero slice or level of a higher-dimensional texture.
578 bld
.MUL(offset(addr
, bld
, 1),
579 offset(addr
, bld
, 1), offset(stride
, bld
, 1));
580 bld
.ADD(addr
, addr
, offset(addr
, bld
, 1));
581 bld
.MUL(dst
, addr
, stride
);
588 namespace image_format_conversion
{
589 using image_format_info::color_u
;
/**
 * Maximum representable value in an unsigned integer with the given
 * number of bits.
 *
 * The function body was lost in extraction although the converters
 * below call scale(); reconstructed here.  Callers only pass n < 32,
 * so the shift cannot overflow.
 */
inline unsigned
scale(unsigned n)
{
   return (1 << n) - 1;
}
604 * Pack the vector \p src in a bitfield given the per-component bit
605 * shifts and widths. Note that bitfield components are not allowed to
606 * cross 32-bit boundaries.
609 emit_pack(const fs_builder
&bld
, const fs_reg
&src
,
610 const color_u
&shifts
, const color_u
&widths
)
612 const fs_reg dst
= bld
.vgrf(BRW_REGISTER_TYPE_UD
, 4);
615 for (unsigned c
= 0; c
< 4; ++c
) {
617 const fs_reg tmp
= bld
.vgrf(BRW_REGISTER_TYPE_UD
);
619 /* Shift each component left to the correct bitfield position. */
620 bld
.SHL(tmp
, offset(src
, bld
, c
), fs_reg(shifts
[c
] % 32));
622 /* Add everything up. */
623 if (seen
[shifts
[c
] / 32]) {
624 bld
.OR(offset(dst
, bld
, shifts
[c
] / 32),
625 offset(dst
, bld
, shifts
[c
] / 32), tmp
);
627 bld
.MOV(offset(dst
, bld
, shifts
[c
] / 32), tmp
);
628 seen
[shifts
[c
] / 32] = true;
637 * Unpack a vector from the bitfield \p src given the per-component bit
638 * shifts and widths. Note that bitfield components are not allowed to
639 * cross 32-bit boundaries.
642 emit_unpack(const fs_builder
&bld
, const fs_reg
&src
,
643 const color_u
&shifts
, const color_u
&widths
)
645 const fs_reg dst
= bld
.vgrf(src
.type
, 4);
647 for (unsigned c
= 0; c
< 4; ++c
) {
649 /* Shift left to discard the most significant bits. */
650 bld
.SHL(offset(dst
, bld
, c
),
651 offset(src
, bld
, shifts
[c
] / 32),
652 fs_reg(32 - shifts
[c
] % 32 - widths
[c
]));
654 /* Shift back to the least significant bits using an arithmetic
655 * shift to get sign extension on signed types.
657 bld
.ASR(offset(dst
, bld
, c
),
658 offset(dst
, bld
, c
), fs_reg(32 - widths
[c
]));
666 * Convert an integer vector into another integer vector of the
667 * specified bit widths, properly handling overflow.
670 emit_convert_to_integer(const fs_builder
&bld
, const fs_reg
&src
,
671 const color_u
&widths
, bool is_signed
)
673 const unsigned s
= (is_signed
? 1 : 0);
674 const fs_reg dst
= bld
.vgrf(
675 is_signed
? BRW_REGISTER_TYPE_D
: BRW_REGISTER_TYPE_UD
, 4);
676 assert(src
.type
== dst
.type
);
678 for (unsigned c
= 0; c
< 4; ++c
) {
680 /* Clamp to the maximum value. */
681 bld
.emit_minmax(offset(dst
, bld
, c
), offset(src
, bld
, c
),
682 fs_reg((int)scale(widths
[c
] - s
)),
685 /* Clamp to the minimum value. */
687 bld
.emit_minmax(offset(dst
, bld
, c
), offset(dst
, bld
, c
),
688 fs_reg(-(int)scale(widths
[c
] - s
) - 1),
697 * Convert a normalized fixed-point vector of the specified signedness
698 * and bit widths into a floating point vector.
701 emit_convert_from_scaled(const fs_builder
&bld
, const fs_reg
&src
,
702 const color_u
&widths
, bool is_signed
)
704 const unsigned s
= (is_signed
? 1 : 0);
705 const fs_reg dst
= bld
.vgrf(BRW_REGISTER_TYPE_F
, 4);
707 for (unsigned c
= 0; c
< 4; ++c
) {
709 /* Convert to float. */
710 bld
.MOV(offset(dst
, bld
, c
), offset(src
, bld
, c
));
712 /* Divide by the normalization constants. */
713 bld
.MUL(offset(dst
, bld
, c
), offset(dst
, bld
, c
),
714 fs_reg(1.0f
/ scale(widths
[c
] - s
)));
716 /* Clamp to the minimum value. */
718 bld
.emit_minmax(offset(dst
, bld
, c
),
719 offset(dst
, bld
, c
), fs_reg(-1.0f
),
727 * Convert a floating-point vector into a normalized fixed-point vector
728 * of the specified signedness and bit widths.
731 emit_convert_to_scaled(const fs_builder
&bld
, const fs_reg
&src
,
732 const color_u
&widths
, bool is_signed
)
734 const unsigned s
= (is_signed
? 1 : 0);
735 const fs_reg dst
= bld
.vgrf(
736 is_signed
? BRW_REGISTER_TYPE_D
: BRW_REGISTER_TYPE_UD
, 4);
737 const fs_reg fdst
= retype(dst
, BRW_REGISTER_TYPE_F
);
739 for (unsigned c
= 0; c
< 4; ++c
) {
741 /* Clamp the normalized floating-point argument. */
743 bld
.emit_minmax(offset(fdst
, bld
, c
), offset(src
, bld
, c
),
744 fs_reg(-1.0f
), BRW_CONDITIONAL_GE
);
746 bld
.emit_minmax(offset(fdst
, bld
, c
), offset(fdst
, bld
, c
),
747 fs_reg(1.0f
), BRW_CONDITIONAL_L
);
749 set_saturate(true, bld
.MOV(offset(fdst
, bld
, c
),
750 offset(src
, bld
, c
)));
753 /* Multiply by the normalization constants. */
754 bld
.MUL(offset(fdst
, bld
, c
), offset(fdst
, bld
, c
),
755 fs_reg((float)scale(widths
[c
] - s
)));
757 /* Convert to integer. */
758 bld
.RNDE(offset(fdst
, bld
, c
), offset(fdst
, bld
, c
));
759 bld
.MOV(offset(dst
, bld
, c
), offset(fdst
, bld
, c
));
767 * Convert a floating point vector of the specified bit widths into a
768 * 32-bit floating point vector.
771 emit_convert_from_float(const fs_builder
&bld
, const fs_reg
&src
,
772 const color_u
&widths
)
774 const fs_reg dst
= bld
.vgrf(BRW_REGISTER_TYPE_UD
, 4);
775 const fs_reg fdst
= retype(dst
, BRW_REGISTER_TYPE_F
);
777 for (unsigned c
= 0; c
< 4; ++c
) {
779 bld
.MOV(offset(dst
, bld
, c
), offset(src
, bld
, c
));
781 /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
782 * This works because they have a 5-bit exponent just like the
783 * 16-bit floating point format, and they have no sign bit.
786 bld
.SHL(offset(dst
, bld
, c
),
787 offset(dst
, bld
, c
), fs_reg(15 - widths
[c
]));
789 /* Convert to 32-bit floating point. */
790 bld
.F16TO32(offset(fdst
, bld
, c
), offset(dst
, bld
, c
));
798 * Convert a vector into a floating point vector of the specified bit
802 emit_convert_to_float(const fs_builder
&bld
, const fs_reg
&src
,
803 const color_u
&widths
)
805 const fs_reg dst
= bld
.vgrf(BRW_REGISTER_TYPE_UD
, 4);
806 const fs_reg fdst
= retype(dst
, BRW_REGISTER_TYPE_F
);
808 for (unsigned c
= 0; c
< 4; ++c
) {
810 bld
.MOV(offset(fdst
, bld
, c
), offset(src
, bld
, c
));
812 /* Clamp to the minimum value. */
814 bld
.emit_minmax(offset(fdst
, bld
, c
), offset(fdst
, bld
, c
),
815 fs_reg(0.0f
), BRW_CONDITIONAL_GE
);
817 /* Convert to 16-bit floating-point. */
818 bld
.F32TO16(offset(dst
, bld
, c
), offset(fdst
, bld
, c
));
820 /* Discard the least significant bits to get floating point
821 * numbers of the requested width. This works because the
822 * 10-bit and 11-bit floating point formats have a 5-bit
823 * exponent just like the 16-bit format, and they have no sign
827 bld
.SHR(offset(dst
, bld
, c
), offset(dst
, bld
, c
),
828 fs_reg(15 - widths
[c
]));
836 * Fill missing components of a vector with 0, 0, 0, 1.
839 emit_pad(const fs_builder
&bld
, const fs_reg
&src
,
840 const color_u
&widths
)
842 const fs_reg dst
= bld
.vgrf(src
.type
, 4);
843 const unsigned pad
[] = { 0, 0, 0, 1 };
845 for (unsigned c
= 0; c
< 4; ++c
)
846 bld
.MOV(offset(dst
, bld
, c
),
847 widths
[c
] ? offset(src
, bld
, c
) : fs_reg(pad
[c
]));
855 namespace image_access
{
857 * Load a vector from a surface of the given format and dimensionality
858 * at the given coordinates. \p surf_dims and \p arr_dims give the
859 * number of non-array and array coordinates of the image respectively.
862 emit_image_load(const fs_builder
&bld
,
863 const fs_reg
&image
, const fs_reg
&addr
,
864 unsigned surf_dims
, unsigned arr_dims
,
867 using namespace image_format_info
;
868 using namespace image_format_conversion
;
869 using namespace image_validity
;
870 using namespace image_coordinates
;
871 using namespace surface_access
;
872 const brw_device_info
*devinfo
= bld
.shader
->devinfo
;
873 const mesa_format lower_format
=
874 brw_lower_mesa_image_format(devinfo
, format
);
877 /* Transform the image coordinates into actual surface coordinates. */
879 emit_image_coordinates(bld
, addr
, surf_dims
, arr_dims
, format
);
880 const unsigned dims
=
881 num_image_coordinates(bld
, surf_dims
, arr_dims
, format
);
883 if (has_matching_typed_format(devinfo
, format
)) {
884 /* Hopefully we get here most of the time... */
885 tmp
= emit_typed_read(bld
, image
, saddr
, dims
,
886 _mesa_format_num_components(lower_format
));
888 /* Untyped surface reads return 32 bits of the surface per
889 * component, without any sort of unpacking or type conversion,
891 const unsigned size
= _mesa_get_format_bytes(format
) / 4;
893 /* they don't properly handle out of bounds access, so we have to
894 * check manually if the coordinates are valid and predicate the
895 * surface read on the result,
897 const brw_predicate pred
=
898 emit_bounds_check(bld
, image
, saddr
, dims
);
900 /* and they don't know about surface coordinates, we need to
901 * convert them to a raw memory offset.
903 const fs_reg laddr
= emit_address_calculation(bld
, image
, saddr
, dims
);
905 tmp
= emit_untyped_read(bld
, image
, laddr
, 1, size
, pred
);
907 /* An out of bounds surface access should give zero as result. */
908 for (unsigned c
= 0; c
< size
; ++c
)
909 set_predicate(pred
, bld
.SEL(offset(tmp
, bld
, c
),
910 offset(tmp
, bld
, c
), fs_reg(0)));
913 /* Set the register type to D instead of UD if the data type is
914 * represented as a signed integer in memory so that sign extension
915 * is handled correctly by unpack.
917 if (needs_sign_extension(format
))
918 tmp
= retype(tmp
, BRW_REGISTER_TYPE_D
);
920 if (!has_supported_bit_layout(devinfo
, format
)) {
921 /* Unpack individual vector components from the bitfield if the
922 * hardware is unable to do it for us.
924 if (has_split_bit_layout(devinfo
, format
))
925 tmp
= emit_pack(bld
, tmp
, get_bit_shifts(lower_format
),
926 get_bit_widths(lower_format
));
928 tmp
= emit_unpack(bld
, tmp
, get_bit_shifts(format
),
929 get_bit_widths(format
));
931 } else if ((needs_sign_extension(format
) &&
932 !is_conversion_trivial(devinfo
, format
)) ||
933 has_undefined_high_bits(devinfo
, format
)) {
934 /* Perform a trivial unpack even though the bit layout matches in
935 * order to get the most significant bits of each component
936 * initialized properly.
938 tmp
= emit_unpack(bld
, tmp
, color_u(0, 32, 64, 96),
939 get_bit_widths(format
));
942 if (!_mesa_is_format_integer(format
)) {
943 if (is_conversion_trivial(devinfo
, format
)) {
944 /* Just need to cast the vector to the target type. */
945 tmp
= retype(tmp
, BRW_REGISTER_TYPE_F
);
947 /* Do the right sort of type conversion to float. */
948 if (_mesa_get_format_datatype(format
) == GL_FLOAT
)
949 tmp
= emit_convert_from_float(
950 bld
, tmp
, get_bit_widths(format
));
952 tmp
= emit_convert_from_scaled(
953 bld
, tmp
, get_bit_widths(format
),
954 _mesa_is_format_signed(format
));
958 /* Initialize missing components of the result. */
959 return emit_pad(bld
, tmp
, get_bit_widths(format
));
963 * Store a vector in a surface of the given format and dimensionality at
964 * the given coordinates. \p surf_dims and \p arr_dims give the number
965 * of non-array and array coordinates of the image respectively.
968 emit_image_store(const fs_builder
&bld
, const fs_reg
&image
,
969 const fs_reg
&addr
, const fs_reg
&src
,
970 unsigned surf_dims
, unsigned arr_dims
,
973 using namespace image_format_info
;
974 using namespace image_format_conversion
;
975 using namespace image_validity
;
976 using namespace image_coordinates
;
977 using namespace surface_access
;
978 const brw_device_info
*devinfo
= bld
.shader
->devinfo
;
980 /* Transform the image coordinates into actual surface coordinates. */
982 emit_image_coordinates(bld
, addr
, surf_dims
, arr_dims
, format
);
983 const unsigned dims
=
984 num_image_coordinates(bld
, surf_dims
, arr_dims
, format
);
986 if (format
== MESA_FORMAT_NONE
) {
987 /* We don't know what the format is, but that's fine because it
988 * implies write-only access, and typed surface writes are always
989 * able to take care of type conversion and packing for us.
991 emit_typed_write(bld
, image
, saddr
, src
, dims
, 4);
994 const mesa_format lower_format
=
995 brw_lower_mesa_image_format(devinfo
, format
);
998 if (!is_conversion_trivial(devinfo
, format
)) {
999 /* Do the right sort of type conversion. */
1000 if (_mesa_get_format_datatype(format
) == GL_FLOAT
)
1001 tmp
= emit_convert_to_float(bld
, tmp
, get_bit_widths(format
));
1003 else if (_mesa_is_format_integer(format
))
1004 tmp
= emit_convert_to_integer(bld
, tmp
, get_bit_widths(format
),
1005 _mesa_is_format_signed(format
));
1008 tmp
= emit_convert_to_scaled(bld
, tmp
, get_bit_widths(format
),
1009 _mesa_is_format_signed(format
));
1012 /* We're down to bit manipulation at this point. */
1013 tmp
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
1015 if (!has_supported_bit_layout(devinfo
, format
)) {
1016 /* Pack the vector components into a bitfield if the hardware
1017 * is unable to do it for us.
1019 if (has_split_bit_layout(devinfo
, format
))
1020 tmp
= emit_unpack(bld
, tmp
, get_bit_shifts(lower_format
),
1021 get_bit_widths(lower_format
));
1024 tmp
= emit_pack(bld
, tmp
, get_bit_shifts(format
),
1025 get_bit_widths(format
));
1028 if (has_matching_typed_format(devinfo
, format
)) {
1029 /* Hopefully we get here most of the time... */
1030 emit_typed_write(bld
, image
, saddr
, tmp
, dims
,
1031 _mesa_format_num_components(lower_format
));
1034 /* Untyped surface writes store 32 bits of the surface per
1035 * component, without any sort of packing or type conversion,
1037 const unsigned size
= _mesa_get_format_bytes(format
) / 4;
1039 /* they don't properly handle out of bounds access, so we have
1040 * to check manually if the coordinates are valid and predicate
1041 * the surface write on the result,
1043 const brw_predicate pred
=
1044 emit_bounds_check(bld
, image
, saddr
, dims
);
1046 /* and, phew, they don't know about surface coordinates, we
1047 * need to convert them to a raw memory offset.
1049 const fs_reg laddr
= emit_address_calculation(
1050 bld
, image
, saddr
, dims
);
1052 emit_untyped_write(bld
, image
, laddr
, tmp
, 1, size
, pred
);
1058 * Perform an atomic read-modify-write operation in a surface of the
1059 * given dimensionality at the given coordinates. \p surf_dims and \p
1060 * arr_dims give the number of non-array and array coordinates of the
1061 * image respectively. Main building block of the imageAtomic GLSL
1065 emit_image_atomic(const fs_builder
&bld
,
1066 const fs_reg
&image
, const fs_reg
&addr
,
1067 const fs_reg
&src0
, const fs_reg
&src1
,
1068 unsigned surf_dims
, unsigned arr_dims
,
1069 unsigned rsize
, unsigned op
)
1071 using namespace image_validity
;
1072 using namespace image_coordinates
;
1073 using namespace surface_access
;
1074 /* Avoid performing an atomic operation on an unbound surface. */
1075 const brw_predicate pred
= emit_surface_check(bld
, image
);
1077 /* Transform the image coordinates into actual surface coordinates. */
1078 const fs_reg saddr
=
1079 emit_image_coordinates(bld
, addr
, surf_dims
, arr_dims
,
1080 MESA_FORMAT_R_UINT32
);
1081 const unsigned dims
=
1082 num_image_coordinates(bld
, surf_dims
, arr_dims
,
1083 MESA_FORMAT_R_UINT32
);
1085 /* Thankfully we can do without untyped atomics here. */
1086 const fs_reg tmp
= emit_typed_atomic(bld
, image
, saddr
, src0
, src1
,
1087 dims
, rsize
, op
, pred
);
1089 /* An unbound surface access should give zero as result. */
1091 set_predicate(pred
, bld
.SEL(tmp
, tmp
, fs_reg(0)));