i965/fs: Fix hang on IVB and VLV with image format mismatch.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_surface_builder.cpp
1 /*
2 * Copyright © 2013-2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_fs_surface_builder.h"
25 #include "brw_fs.h"
26
27 using namespace brw;
28
29 namespace brw {
30 namespace surface_access {
31 namespace {
32 /**
33 * Generate a logical send opcode for a surface message and return
34 * the result.
35 */
36 fs_reg
37 emit_send(const fs_builder &bld, enum opcode opcode,
38 const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
39 unsigned dims, unsigned arg, unsigned rsize,
40 brw_predicate pred = BRW_PREDICATE_NONE)
41 {
42 /* Reduce the dynamically uniform surface index to a single
43 * scalar.
44 */
45 const fs_reg usurface = bld.emit_uniformize(surface);
46 const fs_reg srcs[] = {
47 addr, src, usurface, fs_reg(dims), fs_reg(arg)
48 };
49 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
50 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
51
52 inst->regs_written = rsize * bld.dispatch_width() / 8;
53 inst->predicate = pred;
54 return dst;
55 }
56 }
57
58 /**
59 * Emit an untyped surface read opcode. \p dims determines the number
60 * of components of the address and \p size the number of components of
61 * the returned value.
62 */
63 fs_reg
64 emit_untyped_read(const fs_builder &bld,
65 const fs_reg &surface, const fs_reg &addr,
66 unsigned dims, unsigned size,
67 brw_predicate pred)
68 {
69 return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
70 addr, fs_reg(), surface, dims, size, size, pred);
71 }
72
73 /**
74 * Emit an untyped surface write opcode. \p dims determines the number
75 * of components of the address and \p size the number of components of
76 * the argument.
77 */
78 void
79 emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
80 const fs_reg &addr, const fs_reg &src,
81 unsigned dims, unsigned size,
82 brw_predicate pred)
83 {
84 emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
85 addr, src, surface, dims, size, 0, pred);
86 }
87
88 /**
89 * Emit an untyped surface atomic opcode. \p dims determines the number
90 * of components of the address and \p rsize the number of components of
91 * the returned value (either zero or one).
92 */
93 fs_reg
94 emit_untyped_atomic(const fs_builder &bld,
95 const fs_reg &surface, const fs_reg &addr,
96 const fs_reg &src0, const fs_reg &src1,
97 unsigned dims, unsigned rsize, unsigned op,
98 brw_predicate pred)
99 {
100 /* FINISHME: Factor out this frequently recurring pattern into a
101 * helper function.
102 */
103 const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
104 const fs_reg srcs[] = { src0, src1 };
105 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
106 bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
107
108 return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
109 addr, tmp, surface, dims, op, rsize, pred);
110 }
111
112 /**
113 * Emit a typed surface read opcode. \p dims determines the number of
114 * components of the address and \p size the number of components of the
115 * returned value.
116 */
117 fs_reg
118 emit_typed_read(const fs_builder &bld, const fs_reg &surface,
119 const fs_reg &addr, unsigned dims, unsigned size)
120 {
121 return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
122 addr, fs_reg(), surface, dims, size, size);
123 }
124
125 /**
126 * Emit a typed surface write opcode. \p dims determines the number of
127 * components of the address and \p size the number of components of the
128 * argument.
129 */
130 void
131 emit_typed_write(const fs_builder &bld, const fs_reg &surface,
132 const fs_reg &addr, const fs_reg &src,
133 unsigned dims, unsigned size)
134 {
135 emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
136 addr, src, surface, dims, size, 0);
137 }
138
139 /**
140 * Emit a typed surface atomic opcode. \p dims determines the number of
141 * components of the address and \p rsize the number of components of
142 * the returned value (either zero or one).
143 */
144 fs_reg
145 emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
146 const fs_reg &addr,
147 const fs_reg &src0, const fs_reg &src1,
148 unsigned dims, unsigned rsize, unsigned op,
149 brw_predicate pred)
150 {
151 /* FINISHME: Factor out this frequently recurring pattern into a
152 * helper function.
153 */
154 const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
155 const fs_reg srcs[] = { src0, src1 };
156 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
157 bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
158
159 return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
160 addr, tmp, surface, dims, op, rsize);
161 }
162 }
163 }
164
165 namespace {
166 namespace image_format_info {
167 /**
168 * Simple 4-tuple of scalars used to pass around per-color component
169 * values.
170 */
171 struct color_u {
172 color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
173 {
174 }
175
176 color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
177 r(r), g(g), b(b), a(a)
178 {
179 }
180
181 unsigned
182 operator[](unsigned i) const
183 {
184 const unsigned xs[] = { r, g, b, a };
185 return xs[i];
186 }
187
188 unsigned r, g, b, a;
189 };
190
191 /**
192 * Return the per-channel bitfield widths for a given image format.
193 */
194 inline color_u
195 get_bit_widths(mesa_format format)
196 {
197 return color_u(_mesa_get_format_bits(format, GL_RED_BITS),
198 _mesa_get_format_bits(format, GL_GREEN_BITS),
199 _mesa_get_format_bits(format, GL_BLUE_BITS),
200 _mesa_get_format_bits(format, GL_ALPHA_BITS));
201 }
202
203 /**
204 * Return the per-channel bitfield shifts for a given image format.
205 */
206 inline color_u
207 get_bit_shifts(mesa_format format)
208 {
209 const color_u widths = get_bit_widths(format);
210 return color_u(0, widths.r, widths.r + widths.g,
211 widths.r + widths.g + widths.b);
212 }
213
214 /**
215 * Return true if all present components have the same bit width.
216 */
217 inline bool
218 is_homogeneous(mesa_format format)
219 {
220 const color_u widths = get_bit_widths(format);
221 return ((widths.g == 0 || widths.g == widths.r) &&
222 (widths.b == 0 || widths.b == widths.r) &&
223 (widths.a == 0 || widths.a == widths.r));
224 }
225
226 /**
227 * Return true if the format conversion boils down to a trivial copy.
228 */
229 inline bool
230 is_conversion_trivial(const brw_device_info *devinfo, mesa_format format)
231 {
232 return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
233 format == brw_lower_mesa_image_format(devinfo, format);
234 }
235
236 /**
237 * Return true if the hardware natively supports some format with
238 * compatible bitfield layout, but possibly different data types.
239 */
240 inline bool
241 has_supported_bit_layout(const brw_device_info *devinfo,
242 mesa_format format)
243 {
244 const color_u widths = get_bit_widths(format);
245 const color_u lower_widths = get_bit_widths(
246 brw_lower_mesa_image_format(devinfo, format));
247
248 return (widths.r == lower_widths.r &&
249 widths.g == lower_widths.g &&
250 widths.b == lower_widths.b &&
251 widths.a == lower_widths.a);
252 }
253
254 /**
255 * Return true if we are required to spread individual components over
256 * several components of the format used by the hardware (RG32 and
257 * friends implemented as RGBA16UI).
258 */
259 inline bool
260 has_split_bit_layout(const brw_device_info *devinfo, mesa_format format)
261 {
262 const mesa_format lower_format =
263 brw_lower_mesa_image_format(devinfo, format);
264
265 return (_mesa_format_num_components(format) <
266 _mesa_format_num_components(lower_format));
267 }
268
269 /**
270 * Return true unless we have to fall back to untyped surface access.
271 * Fail!
272 */
273 inline bool
274 has_matching_typed_format(const brw_device_info *devinfo,
275 mesa_format format)
276 {
277 return (_mesa_get_format_bytes(format) <= 4 ||
278 (_mesa_get_format_bytes(format) <= 8 &&
279 (devinfo->gen >= 8 || devinfo->is_haswell)) ||
280 devinfo->gen >= 9);
281 }
282
283 /**
284 * Return true if the hardware returns garbage in the unused high bits
285 * of each component. This may happen on IVB because we rely on the
286 * undocumented behavior that typed reads from surfaces of the
287 * unsupported R8 and R16 formats return useful data in their least
288 * significant bits.
289 */
290 inline bool
291 has_undefined_high_bits(const brw_device_info *devinfo,
292 mesa_format format)
293 {
294 const mesa_format lower_format =
295 brw_lower_mesa_image_format(devinfo, format);
296
297 return (devinfo->gen == 7 && !devinfo->is_haswell &&
298 (lower_format == MESA_FORMAT_R_UINT16 ||
299 lower_format == MESA_FORMAT_R_UINT8));
300 }
301
302 /**
303 * Return true if the format represents values as signed integers
304 * requiring sign extension when unpacking.
305 */
306 inline bool
307 needs_sign_extension(mesa_format format)
308 {
309 return (_mesa_get_format_datatype(format) == GL_SIGNED_NORMALIZED ||
310 _mesa_get_format_datatype(format) == GL_INT);
311 }
312 }
313
314 namespace image_validity {
315 /**
316 * Check whether the bound image is suitable for untyped access.
317 */
318 brw_predicate
319 emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
320 brw_predicate pred)
321 {
322 const brw_device_info *devinfo = bld.shader->devinfo;
323 const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
324
325 if (devinfo->gen == 7 && !devinfo->is_haswell) {
326 /* Check whether the first stride component (i.e. the Bpp value)
327 * is greater than four, what on Gen7 indicates that a surface of
328 * type RAW has been bound for untyped access. Reading or writing
329 * to a surface of type other than RAW using untyped surface
330 * messages causes a hang on IVB and VLV.
331 */
332 set_predicate(pred,
333 bld.CMP(bld.null_reg_ud(), stride, fs_reg(4),
334 BRW_CONDITIONAL_G));
335
336 return BRW_PREDICATE_NORMAL;
337 } else {
338 /* More recent generations handle the format mismatch
339 * gracefully.
340 */
341 return pred;
342 }
343 }
344
345 /**
346 * Check whether there is an image bound at the given index and write
347 * the comparison result to f0.0. Returns an appropriate predication
348 * mode to use on subsequent image operations.
349 */
350 brw_predicate
351 emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
352 {
353 const brw_device_info *devinfo = bld.shader->devinfo;
354 const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
355
356 if (devinfo->gen == 7 && !devinfo->is_haswell) {
357 /* Check the first component of the size field to find out if the
358 * image is bound. Necessary on IVB for typed atomics because
359 * they don't seem to respect null surfaces and will happily
360 * corrupt or read random memory when no image is bound.
361 */
362 bld.CMP(bld.null_reg_ud(),
363 retype(size, BRW_REGISTER_TYPE_UD),
364 fs_reg(0), BRW_CONDITIONAL_NZ);
365
366 return BRW_PREDICATE_NORMAL;
367 } else {
368 /* More recent platforms implement compliant behavior when a null
369 * surface is bound.
370 */
371 return BRW_PREDICATE_NONE;
372 }
373 }
374
      /**
       * Check whether the provided coordinates are within the image bounds
       * and write the comparison result to f0.0.  Returns an appropriate
       * predication mode to use on subsequent image operations.
       */
      brw_predicate
      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
                        const fs_reg &addr, unsigned dims)
      {
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         /* Compare each coordinate against the matching size component.
          * The per-coordinate results are AND-ed together by predicating
          * every CMP after the first one on the accumulated flag result,
          * so the final flag is set only if all coordinates are in range.
          * Coordinates are treated as unsigned, so negative values also
          * fail the check.
          */
         for (unsigned c = 0; c < dims; ++c)
            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
                          bld.CMP(bld.null_reg_ud(),
                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
                                  offset(size, bld, c),
                                  BRW_CONDITIONAL_L));

         return BRW_PREDICATE_NORMAL;
      }
395 }
396
397 namespace image_coordinates {
398 /**
399 * Return the total number of coordinates needed to address a texel of
400 * the surface, which may be more than the sum of \p surf_dims and \p
401 * arr_dims if padding is required.
402 */
403 unsigned
404 num_image_coordinates(const fs_builder &bld,
405 unsigned surf_dims, unsigned arr_dims,
406 mesa_format format)
407 {
408 /* HSW in vec4 mode and our software coordinate handling for untyped
409 * reads want the array index to be at the Z component.
410 */
411 const bool array_index_at_z =
412 !image_format_info::has_matching_typed_format(
413 bld.shader->devinfo, format);
414 const unsigned zero_dims =
415 ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
416
417 return surf_dims + zero_dims + arr_dims;
418 }
419
      /**
       * Transform image coordinates into the form expected by the
       * implementation.  For 1-D array images that require it, this inserts
       * a zero Y coordinate so the array index ends up in the Z component;
       * otherwise the address is passed through unchanged.
       */
      fs_reg
      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
                             unsigned surf_dims, unsigned arr_dims,
                             mesa_format format)
      {
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         /* dims exceeds the caller-provided count only in the padded
          * 1-D array case reported by num_image_coordinates().
          */
         if (dims > surf_dims + arr_dims) {
            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
            /* The array index is required to be passed in as the Z component,
             * insert a zero at the Y component to shift it to the right
             * position.
             *
             * FINISHME: Factor out this frequently recurring pattern into a
             * helper function.
             */
            const fs_reg srcs[] = { addr, fs_reg(0), offset(addr, bld, 1) };
            const fs_reg dst = bld.vgrf(addr.type, dims);
            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
            return dst;
         } else {
            return addr;
         }
      }
449
      /**
       * Calculate the offset in memory of the texel given by \p coord.
       *
       * This is meant to be used with untyped surface messages to access a
       * tiled surface, what involves taking into account the tiling and
       * swizzling modes of the surface manually so it will hopefully not
       * happen very often.
       *
       * The tiling algorithm implemented here matches either the X or Y
       * tiling layouts supported by the hardware depending on the tiling
       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
       * explanation of the hardware tiling format.
       */
      fs_reg
      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
                               const fs_reg &coord, unsigned dims)
      {
         const brw_device_info *devinfo = bld.shader->devinfo;
         /* Per-image parameters pushed as uniforms by the state upload code. */
         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
         /* 2-component temporaries for the intermediate x/y calculations. */
         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);

         /* Shift the coordinates by the fixed surface offset.  It may be
          * non-zero if the image is a single slice of a higher-dimensional
          * surface, or if a non-zero mipmap level of the surface is bound to
          * the pipeline.  The offset needs to be applied here rather than at
          * surface state set-up time because the desired slice-level may
          * start mid-tile, so simply shifting the surface base address
          * wouldn't give a well-formed tiled surface in the general case.
          */
         for (unsigned c = 0; c < 2; ++c)
            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
                    (c < dims ?
                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
                     fs_reg(0)));

         /* The layout of 3-D textures in memory is sort-of like a tiling
          * format.  At each miplevel, the slices are arranged in rows of
          * 2^level slices per row.  The slice row is stored in tmp.y and
          * the slice within the row is stored in tmp.x.
          *
          * The layout of 2-D array textures and cubemaps is much simpler:
          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
          * stored in memory as an array of slices, each one being a 2-D
          * arrangement of miplevels, or as a 2D arrangement of miplevels,
          * each one being an array of slices.  In either case the separation
          * between slices of the same LOD is equal to the qpitch value
          * provided as stride.w.
          *
          * This code can be made to handle either 2D arrays and 3D textures
          * by passing in the miplevel as tile.z for 3-D textures and 0 in
          * tile.z for 2-D array textures.
          *
          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
          * of the hardware 3D texture and 2D array layouts.
          */
         if (dims > 2) {
            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
             * index.
             */
            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), fs_reg(0),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
            bld.SHR(offset(tmp, bld, 1),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
                    offset(tile, bld, 2));

            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
             * slice offset.
             */
            for (unsigned c = 0; c < 2; ++c) {
               bld.MUL(offset(tmp, bld, c),
                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
               bld.ADD(offset(addr, bld, c),
                       offset(addr, bld, c), offset(tmp, bld, c));
            }
         }

         if (dims > 1) {
            /* Calculate the major/minor x and y indices.  In order to
             * accommodate both X and Y tiling, the Y-major tiling format is
             * treated as being a bunch of narrow X-tiles placed next to each
             * other.  This means that the tile width for Y-tiling is actually
             * the width of one sub-column of the Y-major tile where each 4K
             * tile has 8 512B sub-columns.
             *
             * The major Y value is the row of tiles in which the pixel lives.
             * The major X value is the tile sub-column in which the pixel
             * lives; for X tiling, this is the same as the tile column, for Y
             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
             * are the position within the sub-column.
             */
            for (unsigned c = 0; c < 2; ++c) {
               /* Calculate the minor x and y indices. */
               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
                       fs_reg(0), offset(addr, bld, c));

               /* Calculate the major x and y indices. */
               bld.SHR(offset(major, bld, c),
                       offset(addr, bld, c), offset(tile, bld, c));
            }

            /* Calculate the texel index from the start of the tile row and
             * the vertical coordinate of the row.
             * Equivalent to:
             *   tmp.x = (major.x << tile.y << tile.x) +
             *           (minor.y << tile.x) + minor.x
             *   tmp.y = major.y << tile.y
             */
            bld.SHL(tmp, major, offset(tile, bld, 1));
            bld.ADD(tmp, tmp, offset(minor, bld, 1));
            bld.SHL(tmp, tmp, offset(tile, bld, 0));
            bld.ADD(tmp, tmp, minor);
            bld.SHL(offset(tmp, bld, 1),
                    offset(major, bld, 1), offset(tile, bld, 1));

            /* Add it to the start of the tile row. */
            bld.MUL(offset(tmp, bld, 1),
                    offset(tmp, bld, 1), offset(stride, bld, 1));
            bld.ADD(tmp, tmp, offset(tmp, bld, 1));

            /* Multiply by the Bpp value. */
            bld.MUL(dst, tmp, stride);

            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
               /* Take into account the two dynamically specified shifts.
                * Both are needed to implement swizzling of X-tiled
                * surfaces.  For Y-tiled surfaces only one bit needs to be
                * XOR-ed with bit 6 of the memory address, so a swz value of
                * 0xff (actually interpreted as 31 by the hardware) will be
                * provided to cause the relevant bit of tmp.y to be zero and
                * turn the first XOR into the identity.  For linear surfaces
                * or platforms lacking address swizzling both shifts will be
                * 0xff causing the relevant bits of both tmp.x and .y to be
                * zero, what effectively disables swizzling.
                */
               for (unsigned c = 0; c < 2; ++c)
                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));

               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
               bld.AND(tmp, tmp, fs_reg(1 << 6));
               bld.XOR(dst, dst, tmp);
            }

         } else {
            /* Multiply by the Bpp/stride value.  Note that the addr.y may be
             * non-zero even if the image is one-dimensional because a
             * vertical offset may have been applied above to select a
             * non-zero slice or level of a higher-dimensional texture.
             */
            bld.MUL(offset(addr, bld, 1),
                    offset(addr, bld, 1), offset(stride, bld, 1));
            bld.ADD(addr, addr, offset(addr, bld, 1));
            bld.MUL(dst, addr, stride);
         }

         return dst;
      }
616 }
617
618 namespace image_format_conversion {
619 using image_format_info::color_u;
620
621 namespace {
622 /**
623 * Maximum representable value in an unsigned integer with the given
624 * number of bits.
625 */
626 inline unsigned
627 scale(unsigned n)
628 {
629 return (1 << n) - 1;
630 }
631 }
632
      /**
       * Pack the vector \p src in a bitfield given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_pack(const fs_builder &bld, const fs_reg &src,
                const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         /* Tracks which 32-bit dwords of dst have been written so the first
          * write to each dword can be a MOV instead of an OR.
          */
         bool seen[4] = {};

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);

               /* Shift each component left to the correct bitfield position. */
               bld.SHL(tmp, offset(src, bld, c), fs_reg(shifts[c] % 32));

               /* Add everything up.  shifts[c] / 32 selects the destination
                * dword for this component.
                */
               if (seen[shifts[c] / 32]) {
                  bld.OR(offset(dst, bld, shifts[c] / 32),
                         offset(dst, bld, shifts[c] / 32), tmp);
               } else {
                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
                  seen[shifts[c] / 32] = true;
               }
            }
         }

         return dst;
      }
665
      /**
       * Unpack a vector from the bitfield \p src given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_unpack(const fs_builder &bld, const fs_reg &src,
                  const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Shift left to discard the most significant bits, leaving
                * the component's bits at the top of the dword.
                */
               bld.SHL(offset(dst, bld, c),
                       offset(src, bld, shifts[c] / 32),
                       fs_reg(32 - shifts[c] % 32 - widths[c]));

               /* Shift back to the least significant bits using an arithmetic
                * shift to get sign extension on signed types.
                */
               bld.ASR(offset(dst, bld, c),
                       offset(dst, bld, c), fs_reg(32 - widths[c]));
            }
         }

         return dst;
      }
694
      /**
       * Convert an integer vector into another integer vector of the
       * specified bit widths, properly handling overflow by clamping to
       * the representable range.
       */
      fs_reg
      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths, bool is_signed)
      {
         /* Reserve one bit for the sign when the target type is signed. */
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         assert(src.type == dst.type);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp to the maximum value representable in widths[c] bits. */
               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
                               fs_reg((int)scale(widths[c] - s)),
                               BRW_CONDITIONAL_L);

               /* Clamp to the minimum value (only meaningful for signed
                * types; unsigned values can't go below zero).
                */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                  fs_reg(-(int)scale(widths[c] - s) - 1),
                                  BRW_CONDITIONAL_GE);
            }
         }

         return dst;
      }
725
      /**
       * Convert a normalized fixed-point vector of the specified signedness
       * and bit widths into a floating point vector.
       */
      fs_reg
      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
                               const color_u &widths, bool is_signed)
      {
         /* One bit is reserved for the sign in signed formats. */
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Convert to float. */
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Divide by the normalization constants (implemented as a
                * multiply by the reciprocal).
                */
               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
                       fs_reg(1.0f / scale(widths[c] - s)));

               /* Clamp to the minimum value of -1.0, since the most negative
                * fixed-point code would otherwise map slightly below it.
                */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c),
                                  offset(dst, bld, c), fs_reg(-1.0f),
                                  BRW_CONDITIONAL_GE);
            }
         }
         return dst;
      }
755
      /**
       * Convert a floating-point vector into a normalized fixed-point vector
       * of the specified signedness and bit widths.
       */
      fs_reg
      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
                             const color_u &widths, bool is_signed)
      {
         /* One bit is reserved for the sign in signed formats. */
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         /* Same registers viewed as float for the intermediate math. */
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp the normalized floating-point argument to the
                * representable range: [-1, 1] for signed formats, [0, 1]
                * (via the saturate modifier) for unsigned ones.
                */
               if (is_signed) {
                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
                                  fs_reg(-1.0f), BRW_CONDITIONAL_GE);

                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  fs_reg(1.0f), BRW_CONDITIONAL_L);
               } else {
                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
                                             offset(src, bld, c)));
               }

               /* Multiply by the normalization constants. */
               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
                       fs_reg((float)scale(widths[c] - s)));

               /* Convert to integer, rounding to nearest-even first. */
               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));
            }
         }

         return dst;
      }
795
      /**
       * Convert a floating point vector of the specified bit widths into a
       * 32-bit floating point vector.
       */
      fs_reg
      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         /* Same registers viewed as float for the conversion result. */
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
                * This works because they have a 5-bit exponent just like the
                * 16-bit floating point format, and they have no sign bit.
                */
               if (widths[c] < 16)
                  bld.SHL(offset(dst, bld, c),
                          offset(dst, bld, c), fs_reg(15 - widths[c]));

               /* Convert to 32-bit floating point. */
               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
            }
         }

         return fdst;
      }
826
      /**
       * Convert a vector into a floating point vector of the specified bit
       * widths.
       */
      fs_reg
      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
                            const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         /* Same registers viewed as float for the intermediate math. */
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));

               /* Clamp to the minimum value of zero: the sub-16-bit float
                * formats handled below have no sign bit, so they cannot
                * represent negative values.
                */
               if (widths[c] < 16)
                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  fs_reg(0.0f), BRW_CONDITIONAL_GE);

               /* Convert to 16-bit floating-point. */
               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));

               /* Discard the least significant bits to get floating point
                * numbers of the requested width.  This works because the
                * 10-bit and 11-bit floating point formats have a 5-bit
                * exponent just like the 16-bit format, and they have no sign
                * bit.
                */
               if (widths[c] < 16)
                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
                          fs_reg(15 - widths[c]));
            }
         }

         return dst;
      }
864
865 /**
866 * Fill missing components of a vector with 0, 0, 0, 1.
867 */
868 fs_reg
869 emit_pad(const fs_builder &bld, const fs_reg &src,
870 const color_u &widths)
871 {
872 const fs_reg dst = bld.vgrf(src.type, 4);
873 const unsigned pad[] = { 0, 0, 0, 1 };
874
875 for (unsigned c = 0; c < 4; ++c)
876 bld.MOV(offset(dst, bld, c),
877 widths[c] ? offset(src, bld, c) : fs_reg(pad[c]));
878
879 return dst;
880 }
881 }
882 }
883
884 namespace brw {
885 namespace image_access {
      /**
       * Load a vector from a surface of the given format and dimensionality
       * at the given coordinates.  \p surf_dims and \p arr_dims give the
       * number of non-array and array coordinates of the image respectively.
       */
      fs_reg
      emit_image_load(const fs_builder &bld,
                      const fs_reg &image, const fs_reg &addr,
                      unsigned surf_dims, unsigned arr_dims,
                      mesa_format format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const brw_device_info *devinfo = bld.shader->devinfo;
         /* Hardware-supported format the requested one is lowered to. */
         const mesa_format lower_format =
            brw_lower_mesa_image_format(devinfo, format);
         fs_reg tmp;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (has_matching_typed_format(devinfo, format)) {
            /* Hopefully we get here most of the time... */
            tmp = emit_typed_read(bld, image, saddr, dims,
                                  _mesa_format_num_components(lower_format));
         } else {
            /* Untyped surface reads return 32 bits of the surface per
             * component, without any sort of unpacking or type conversion,
             */
            const unsigned size = _mesa_get_format_bytes(format) / 4;

            /* they don't properly handle out of bounds access, so we have to
             * check manually if the coordinates are valid and predicate the
             * surface read on the result,
             */
            const brw_predicate pred =
               emit_untyped_image_check(bld, image,
                                        emit_bounds_check(bld, image,
                                                          saddr, dims));

            /* and they don't know about surface coordinates, we need to
             * convert them to a raw memory offset.
             */
            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);

            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);

            /* An out of bounds surface access should give zero as result. */
            for (unsigned c = 0; c < size; ++c)
               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
                                           offset(tmp, bld, c), fs_reg(0)));
         }

         /* Set the register type to D instead of UD if the data type is
          * represented as a signed integer in memory so that sign extension
          * is handled correctly by unpack.
          */
         if (needs_sign_extension(format))
            tmp = retype(tmp, BRW_REGISTER_TYPE_D);

         if (!has_supported_bit_layout(devinfo, format)) {
            /* Unpack individual vector components from the bitfield if the
             * hardware is unable to do it for us.
             */
            if (has_split_bit_layout(devinfo, format))
               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
                               get_bit_widths(lower_format));
            else
               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
                                 get_bit_widths(format));

         } else if ((needs_sign_extension(format) &&
                     !is_conversion_trivial(devinfo, format)) ||
                    has_undefined_high_bits(devinfo, format)) {
            /* Perform a trivial unpack even though the bit layout matches in
             * order to get the most significant bits of each component
             * initialized properly.
             */
            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
                              get_bit_widths(format));
         }

         if (!_mesa_is_format_integer(format)) {
            if (is_conversion_trivial(devinfo, format)) {
               /* Just need to cast the vector to the target type. */
               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
            } else {
               /* Do the right sort of type conversion to float. */
               if (_mesa_get_format_datatype(format) == GL_FLOAT)
                  tmp = emit_convert_from_float(
                     bld, tmp, get_bit_widths(format));
               else
                  tmp = emit_convert_from_scaled(
                     bld, tmp, get_bit_widths(format),
                     _mesa_is_format_signed(format));
            }
         }

         /* Initialize missing components of the result (0, 0, 0, 1). */
         return emit_pad(bld, tmp, get_bit_widths(format));
      }
993
      /**
       * Store a vector in a surface of the given format and dimensionality at
       * the given coordinates. \p surf_dims and \p arr_dims give the number
       * of non-array and array coordinates of the image respectively.
       *
       * \param bld       Instruction builder used to emit the generated code.
       * \param image     Register vector holding the image metadata (surface
       *                  index plus the fields read by the validity/address
       *                  helpers).
       * \param addr      Image coordinates provided by the shader.
       * \param src       Source color vector to store.
       * \param surf_dims Number of non-array coordinate components.
       * \param arr_dims  Number of array coordinate components.
       * \param format    Image format, or MESA_FORMAT_NONE for write-only
       *                  access with no declared format.
       */
      void
      emit_image_store(const fs_builder &bld, const fs_reg &image,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned surf_dims, unsigned arr_dims,
                       mesa_format format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const brw_device_info *devinfo = bld.shader->devinfo;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (format == MESA_FORMAT_NONE) {
            /* We don't know what the format is, but that's fine because it
             * implies write-only access, and typed surface writes are always
             * able to take care of type conversion and packing for us.
             */
            emit_typed_write(bld, image, saddr, src, dims, 4);

         } else {
            /* The format the hardware will actually use for the surface;
             * may differ from \p format when the original format has no
             * directly supported equivalent.
             */
            const mesa_format lower_format =
               brw_lower_mesa_image_format(devinfo, format);
            fs_reg tmp = src;

            if (!is_conversion_trivial(devinfo, format)) {
               /* Do the right sort of type conversion: float formats take a
                * straight float conversion, integer formats an integer one,
                * and everything else is treated as normalized/scaled.
                */
               if (_mesa_get_format_datatype(format) == GL_FLOAT)
                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));

               else if (_mesa_is_format_integer(format))
                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
                                                _mesa_is_format_signed(format));

               else
                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
                                               _mesa_is_format_signed(format));
            }

            /* We're down to bit manipulation at this point. */
            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);

            if (!has_supported_bit_layout(devinfo, format)) {
               /* Pack the vector components into a bitfield if the hardware
                * is unable to do it for us.
                *
               * Note the mirror image of the load path: a split bit layout
               * is handled with emit_unpack() over the lowered format's
               * shifts, while the plain case is a straight emit_pack() of
               * the original format -- the two are inverse transforms.
                */
               if (has_split_bit_layout(devinfo, format))
                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
                                    get_bit_widths(lower_format));

               else
                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
                                  get_bit_widths(format));
            }

            if (has_matching_typed_format(devinfo, format)) {
               /* Hopefully we get here most of the time... */
               emit_typed_write(bld, image, saddr, tmp, dims,
                                _mesa_format_num_components(lower_format));

            } else {
               /* Untyped surface writes store 32 bits of the surface per
               * component, without any sort of packing or type conversion,
               * so the payload size is simply the format's byte size in
               * dwords,
                */
               const unsigned size = _mesa_get_format_bytes(format) / 4;

               /* they don't properly handle out of bounds access, so we have
                * to check manually if the coordinates are valid and predicate
                * the surface write on the result,
                */
               const brw_predicate pred =
                  emit_untyped_image_check(bld, image,
                                           emit_bounds_check(bld, image,
                                                             saddr, dims));

               /* and, phew, they don't know about surface coordinates, we
                * need to convert them to a raw memory offset.
                */
               const fs_reg laddr = emit_address_calculation(
                  bld, image, saddr, dims);

               /* A single (dims == 1) raw offset addresses the whole write. */
               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
            }
         }
      }
1090
1091 /**
1092 * Perform an atomic read-modify-write operation in a surface of the
1093 * given dimensionality at the given coordinates. \p surf_dims and \p
1094 * arr_dims give the number of non-array and array coordinates of the
1095 * image respectively. Main building block of the imageAtomic GLSL
1096 * built-ins.
1097 */
1098 fs_reg
1099 emit_image_atomic(const fs_builder &bld,
1100 const fs_reg &image, const fs_reg &addr,
1101 const fs_reg &src0, const fs_reg &src1,
1102 unsigned surf_dims, unsigned arr_dims,
1103 unsigned rsize, unsigned op)
1104 {
1105 using namespace image_validity;
1106 using namespace image_coordinates;
1107 using namespace surface_access;
1108 /* Avoid performing an atomic operation on an unbound surface. */
1109 const brw_predicate pred = emit_typed_atomic_check(bld, image);
1110
1111 /* Transform the image coordinates into actual surface coordinates. */
1112 const fs_reg saddr =
1113 emit_image_coordinates(bld, addr, surf_dims, arr_dims,
1114 MESA_FORMAT_R_UINT32);
1115 const unsigned dims =
1116 num_image_coordinates(bld, surf_dims, arr_dims,
1117 MESA_FORMAT_R_UINT32);
1118
1119 /* Thankfully we can do without untyped atomics here. */
1120 const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
1121 dims, rsize, op, pred);
1122
1123 /* An unbound surface access should give zero as result. */
1124 if (rsize)
1125 set_predicate(pred, bld.SEL(tmp, tmp, fs_reg(0)));
1126
1127 return tmp;
1128 }
1129 }
1130 }