src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
1 /*
2 * Copyright © 2013-2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "isl/isl.h"
25 #include "brw_fs_surface_builder.h"
26 #include "brw_fs.h"
27
28 using namespace brw;
29
30 namespace brw {
31 namespace surface_access {
32 namespace {
33 /**
34 * Generate a logical send opcode for a surface message and return
35 * the result.
36 */
37 fs_reg
38 emit_send(const fs_builder &bld, enum opcode opcode,
39 const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
40 unsigned dims, unsigned arg, unsigned rsize,
41 brw_predicate pred = BRW_PREDICATE_NONE)
42 {
43 /* Reduce the dynamically uniform surface index to a single
44 * scalar.
45 */
46 const fs_reg usurface = bld.emit_uniformize(surface);
47 const fs_reg srcs[] = {
48 addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
49 };
50 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
51 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
52
53 inst->size_written = rsize * dst.component_size(inst->exec_size);
54 inst->predicate = pred;
55 return dst;
56 }
57 }
58
59 /**
60 * Emit an untyped surface read opcode. \p dims determines the number
61 * of components of the address and \p size the number of components of
62 * the returned value.
63 */
64 fs_reg
65 emit_untyped_read(const fs_builder &bld,
66 const fs_reg &surface, const fs_reg &addr,
67 unsigned dims, unsigned size,
68 brw_predicate pred)
69 {
70 return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
71 addr, fs_reg(), surface, dims, size, size, pred);
72 }
73
74 /**
75 * Emit an untyped surface write opcode. \p dims determines the number
76 * of components of the address and \p size the number of components of
77 * the argument.
78 */
79 void
80 emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
81 const fs_reg &addr, const fs_reg &src,
82 unsigned dims, unsigned size,
83 brw_predicate pred)
84 {
85 emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
86 addr, src, surface, dims, size, 0, pred);
87 }
88
89 /**
90 * Emit an untyped surface atomic opcode. \p dims determines the number
91 * of components of the address and \p rsize the number of components of
92 * the returned value (either zero or one).
93 */
94 fs_reg
95 emit_untyped_atomic(const fs_builder &bld,
96 const fs_reg &surface, const fs_reg &addr,
97 const fs_reg &src0, const fs_reg &src1,
98 unsigned dims, unsigned rsize, unsigned op,
99 brw_predicate pred)
100 {
101 /* FINISHME: Factor out this frequently recurring pattern into a
102 * helper function.
103 */
104 const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
105 const fs_reg srcs[] = { src0, src1 };
106 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
107 bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
108
109 return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
110 addr, tmp, surface, dims, op, rsize, pred);
111 }
112
113 /**
114 * Emit a typed surface read opcode. \p dims determines the number of
115 * components of the address and \p size the number of components of the
116 * returned value.
117 */
118 fs_reg
119 emit_typed_read(const fs_builder &bld, const fs_reg &surface,
120 const fs_reg &addr, unsigned dims, unsigned size)
121 {
122 return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
123 addr, fs_reg(), surface, dims, size, size);
124 }
125
126 /**
127 * Emit a typed surface write opcode. \p dims determines the number of
128 * components of the address and \p size the number of components of the
129 * argument.
130 */
131 void
132 emit_typed_write(const fs_builder &bld, const fs_reg &surface,
133 const fs_reg &addr, const fs_reg &src,
134 unsigned dims, unsigned size)
135 {
136 emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
137 addr, src, surface, dims, size, 0);
138 }
139
140 /**
141 * Emit a typed surface atomic opcode. \p dims determines the number of
142 * components of the address and \p rsize the number of components of
143 * the returned value (either zero or one).
144 */
145 fs_reg
146 emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
147 const fs_reg &addr,
148 const fs_reg &src0, const fs_reg &src1,
149 unsigned dims, unsigned rsize, unsigned op,
150 brw_predicate pred)
151 {
152 /* FINISHME: Factor out this frequently recurring pattern into a
153 * helper function.
154 */
155 const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
156 const fs_reg srcs[] = { src0, src1 };
157 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
158 bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
159
160 return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
161 addr, tmp, surface, dims, op, rsize);
162 }
163 }
164 }
165
166 namespace {
167 namespace image_format_info {
168 /* The higher compiler layers use the GL enums for image formats even if
169 * they come in from SPIR-V or Vulkan. We need to turn them into an ISL
170 * enum before we can use them.
171 */
172 enum isl_format
173 isl_format_for_gl_format(uint32_t gl_format)
174 {
175 switch (gl_format) {
176 case GL_R8: return ISL_FORMAT_R8_UNORM;
177 case GL_R8_SNORM: return ISL_FORMAT_R8_SNORM;
178 case GL_R8UI: return ISL_FORMAT_R8_UINT;
179 case GL_R8I: return ISL_FORMAT_R8_SINT;
180 case GL_RG8: return ISL_FORMAT_R8G8_UNORM;
181 case GL_RG8_SNORM: return ISL_FORMAT_R8G8_SNORM;
182 case GL_RG8UI: return ISL_FORMAT_R8G8_UINT;
183 case GL_RG8I: return ISL_FORMAT_R8G8_SINT;
184 case GL_RGBA8: return ISL_FORMAT_R8G8B8A8_UNORM;
185 case GL_RGBA8_SNORM: return ISL_FORMAT_R8G8B8A8_SNORM;
186 case GL_RGBA8UI: return ISL_FORMAT_R8G8B8A8_UINT;
187 case GL_RGBA8I: return ISL_FORMAT_R8G8B8A8_SINT;
188 case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
189 case GL_RGB10_A2: return ISL_FORMAT_R10G10B10A2_UNORM;
190 case GL_RGB10_A2UI: return ISL_FORMAT_R10G10B10A2_UINT;
191 case GL_R16: return ISL_FORMAT_R16_UNORM;
192 case GL_R16_SNORM: return ISL_FORMAT_R16_SNORM;
193 case GL_R16F: return ISL_FORMAT_R16_FLOAT;
194 case GL_R16UI: return ISL_FORMAT_R16_UINT;
195 case GL_R16I: return ISL_FORMAT_R16_SINT;
196 case GL_RG16: return ISL_FORMAT_R16G16_UNORM;
197 case GL_RG16_SNORM: return ISL_FORMAT_R16G16_SNORM;
198 case GL_RG16F: return ISL_FORMAT_R16G16_FLOAT;
199 case GL_RG16UI: return ISL_FORMAT_R16G16_UINT;
200 case GL_RG16I: return ISL_FORMAT_R16G16_SINT;
201 case GL_RGBA16: return ISL_FORMAT_R16G16B16A16_UNORM;
202 case GL_RGBA16_SNORM: return ISL_FORMAT_R16G16B16A16_SNORM;
203 case GL_RGBA16F: return ISL_FORMAT_R16G16B16A16_FLOAT;
204 case GL_RGBA16UI: return ISL_FORMAT_R16G16B16A16_UINT;
205 case GL_RGBA16I: return ISL_FORMAT_R16G16B16A16_SINT;
206 case GL_R32F: return ISL_FORMAT_R32_FLOAT;
207 case GL_R32UI: return ISL_FORMAT_R32_UINT;
208 case GL_R32I: return ISL_FORMAT_R32_SINT;
209 case GL_RG32F: return ISL_FORMAT_R32G32_FLOAT;
210 case GL_RG32UI: return ISL_FORMAT_R32G32_UINT;
211 case GL_RG32I: return ISL_FORMAT_R32G32_SINT;
212 case GL_RGBA32F: return ISL_FORMAT_R32G32B32A32_FLOAT;
213 case GL_RGBA32UI: return ISL_FORMAT_R32G32B32A32_UINT;
214 case GL_RGBA32I: return ISL_FORMAT_R32G32B32A32_SINT;
215 case GL_NONE: return ISL_FORMAT_UNSUPPORTED;
216 default:
217 assert(!"Invalid image format");
218 return ISL_FORMAT_UNSUPPORTED;
219 }
220 }
221
222 /**
223 * Simple 4-tuple of scalars used to pass around per-color component
224 * values.
225 */
226 struct color_u {
227 color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
228 {
229 }
230
231 color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
232 r(r), g(g), b(b), a(a)
233 {
234 }
235
236 unsigned
237 operator[](unsigned i) const
238 {
239 const unsigned xs[] = { r, g, b, a };
240 return xs[i];
241 }
242
243 unsigned r, g, b, a;
244 };
245
246 /**
247 * Return the per-channel bitfield widths for a given image format.
248 */
249 inline color_u
250 get_bit_widths(isl_format format)
251 {
252 const isl_format_layout *fmtl = isl_format_get_layout(format);
253
254 return color_u(fmtl->channels.r.bits,
255 fmtl->channels.g.bits,
256 fmtl->channels.b.bits,
257 fmtl->channels.a.bits);
258 }
259
260 /**
261 * Return the per-channel bitfield shifts for a given image format.
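        * E.g. (illustrative): ISL_FORMAT_R10G10B10A2_UNORM has per-channel
        * widths {10, 10, 10, 2}, so the shifts computed here come out as
        * {0, 10, 20, 30}.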
262 */
263 inline color_u
264 get_bit_shifts(isl_format format)
265 {
266 const color_u widths = get_bit_widths(format);
267 return color_u(0, widths.r, widths.r + widths.g,
268 widths.r + widths.g + widths.b);
269 }
270
271 /**
272 * Return true if all present components have the same bit width.
273 */
274 inline bool
275 is_homogeneous(isl_format format)
276 {
277 const color_u widths = get_bit_widths(format);
278 return ((widths.g == 0 || widths.g == widths.r) &&
279 (widths.b == 0 || widths.b == widths.r) &&
280 (widths.a == 0 || widths.a == widths.r));
281 }
282
283 /**
284 * Return true if the format conversion boils down to a trivial copy.
285 */
286 inline bool
287 is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
288 {
289 return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
290 format == isl_lower_storage_image_format(devinfo, format);
291 }
292
293 /**
294 * Return true if the hardware natively supports some format with
295 * compatible bitfield layout, but possibly different data types.
296 */
297 inline bool
298 has_supported_bit_layout(const gen_device_info *devinfo,
299 isl_format format)
300 {
301 const color_u widths = get_bit_widths(format);
302 const color_u lower_widths = get_bit_widths(
303 isl_lower_storage_image_format(devinfo, format));
304
305 return (widths.r == lower_widths.r &&
306 widths.g == lower_widths.g &&
307 widths.b == lower_widths.b &&
308 widths.a == lower_widths.a);
309 }
310
311 /**
312 * Return true if we are required to spread individual components over
313 * several components of the format used by the hardware (RG32 and
314 * friends implemented as RGBA16UI).
315 */
316 inline bool
317 has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
318 {
319 const isl_format lower_format =
320 isl_lower_storage_image_format(devinfo, format);
321
322 return (isl_format_get_num_channels(format) <
323 isl_format_get_num_channels(lower_format));
324 }
325
326 /**
327 * Return true if the hardware returns garbage in the unused high bits
328 * of each component. This may happen on IVB because we rely on the
329 * undocumented behavior that typed reads from surfaces of the
330 * unsupported R8 and R16 formats return useful data in their least
331 * significant bits.
332 */
333 inline bool
334 has_undefined_high_bits(const gen_device_info *devinfo,
335 isl_format format)
336 {
337 const isl_format lower_format =
338 isl_lower_storage_image_format(devinfo, format);
339
340 return (devinfo->gen == 7 && !devinfo->is_haswell &&
341 (lower_format == ISL_FORMAT_R16_UINT ||
342 lower_format == ISL_FORMAT_R8_UINT));
343 }
344
345 /**
346 * Return true if the format represents values as signed integers
347 * requiring sign extension when unpacking.
348 */
349 inline bool
350 needs_sign_extension(isl_format format)
351 {
352 return isl_format_has_snorm_channel(format) ||
353 isl_format_has_sint_channel(format);
354 }
355 }
356
357 namespace image_validity {
358 /**
359 * Check whether the bound image is suitable for untyped access.
360 */
361 brw_predicate
362 emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
363 brw_predicate pred)
364 {
365 const gen_device_info *devinfo = bld.shader->devinfo;
366 const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
367
368 if (devinfo->gen == 7 && !devinfo->is_haswell) {
369 /* Check whether the first stride component (i.e. the Bpp value)
370             * is greater than four, which on Gen7 indicates that a surface of
371 * type RAW has been bound for untyped access. Reading or writing
372 * to a surface of type other than RAW using untyped surface
373 * messages causes a hang on IVB and VLV.
374 */
375 set_predicate(pred,
376 bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
377 BRW_CONDITIONAL_G));
378
379 return BRW_PREDICATE_NORMAL;
380 } else {
381 /* More recent generations handle the format mismatch
382 * gracefully.
383 */
384 return pred;
385 }
386 }
387
388 /**
389 * Check whether there is an image bound at the given index and write
390 * the comparison result to f0.0. Returns an appropriate predication
391 * mode to use on subsequent image operations.
392 */
393 brw_predicate
394 emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
395 {
396 const gen_device_info *devinfo = bld.shader->devinfo;
397 const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
398
399 if (devinfo->gen == 7 && !devinfo->is_haswell) {
400 /* Check the first component of the size field to find out if the
401 * image is bound. Necessary on IVB for typed atomics because
402 * they don't seem to respect null surfaces and will happily
403 * corrupt or read random memory when no image is bound.
404 */
405 bld.CMP(bld.null_reg_ud(),
406 retype(size, BRW_REGISTER_TYPE_UD),
407 brw_imm_d(0), BRW_CONDITIONAL_NZ);
408
409 return BRW_PREDICATE_NORMAL;
410 } else {
411 /* More recent platforms implement compliant behavior when a null
412 * surface is bound.
413 */
414 return BRW_PREDICATE_NONE;
415 }
416 }
417
418 /**
419 * Check whether the provided coordinates are within the image bounds
420 * and write the comparison result to f0.0. Returns an appropriate
421 * predication mode to use on subsequent image operations.
422 */
423 brw_predicate
424 emit_bounds_check(const fs_builder &bld, const fs_reg &image,
425 const fs_reg &addr, unsigned dims)
426 {
427 const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
428
429 for (unsigned c = 0; c < dims; ++c)
430 set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
431 bld.CMP(bld.null_reg_ud(),
432 offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
433 offset(size, bld, c),
434 BRW_CONDITIONAL_L));
435
436 return BRW_PREDICATE_NORMAL;
437 }
438 }
439
440 namespace image_coordinates {
441 /**
442 * Return the total number of coordinates needed to address a texel of
443 * the surface, which may be more than the sum of \p surf_dims and \p
444 * arr_dims if padding is required.
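          * For instance, a 1-D array image whose format has no matching
          * typed storage format takes three coordinates: X, a zero padding
          * component inserted at Y, and the array index in Z.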
445 */
446 unsigned
447 num_image_coordinates(const fs_builder &bld,
448 unsigned surf_dims, unsigned arr_dims,
449 isl_format format)
450 {
451 /* HSW in vec4 mode and our software coordinate handling for untyped
452 * reads want the array index to be at the Z component.
453 */
454 const bool array_index_at_z =
455 format != ISL_FORMAT_UNSUPPORTED &&
456 !isl_has_matching_typed_storage_image_format(
457 bld.shader->devinfo, format);
458 const unsigned zero_dims =
459 ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
460
461 return surf_dims + zero_dims + arr_dims;
462 }
463
464 /**
465 * Transform image coordinates into the form expected by the
466 * implementation.
467 */
468 fs_reg
469 emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
470 unsigned surf_dims, unsigned arr_dims,
471 isl_format format)
472 {
473 const unsigned dims =
474 num_image_coordinates(bld, surf_dims, arr_dims, format);
475
476 if (dims > surf_dims + arr_dims) {
477 assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
478 /* The array index is required to be passed in as the Z component,
479                 * so insert a zero at the Y component to shift it to the right
480 * position.
481 *
482 * FINISHME: Factor out this frequently recurring pattern into a
483 * helper function.
484 */
485 const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
486 const fs_reg dst = bld.vgrf(addr.type, dims);
487 bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
488 return dst;
489 } else {
490 return addr;
491 }
492 }
493
494 /**
495 * Calculate the offset in memory of the texel given by \p coord.
496 *
497 * This is meant to be used with untyped surface messages to access a
498         * tiled surface, which involves taking into account the tiling and
499 * swizzling modes of the surface manually so it will hopefully not
500 * happen very often.
501 *
502 * The tiling algorithm implemented here matches either the X or Y
503 * tiling layouts supported by the hardware depending on the tiling
504 * coefficients passed to the program as uniforms. See Volume 1 Part 2
505 * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
506 * explanation of the hardware tiling format.
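            *
            * As a rough illustration (assuming, as the code below implies,
            * that the tiling coefficients are the log2 dimensions of a tile
            * sub-column in bytes and rows): for X-tiling a tile is 512 bytes
            * wide and 8 rows high, so tile.x = 9 and tile.y = 3, and a byte
            * address (x, y) splits into minor.x = x & 511, major.x = x >> 9,
            * minor.y = y & 7 and major.y = y >> 3 before being linearized
            * below.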
507 */
508 fs_reg
509 emit_address_calculation(const fs_builder &bld, const fs_reg &image,
510 const fs_reg &coord, unsigned dims)
511 {
512 const gen_device_info *devinfo = bld.shader->devinfo;
513 const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
514 const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
515 const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
516 const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
517 const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
518 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
519 const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
520 const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
521 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
522
523 /* Shift the coordinates by the fixed surface offset. It may be
524 * non-zero if the image is a single slice of a higher-dimensional
525 * surface, or if a non-zero mipmap level of the surface is bound to
526 * the pipeline. The offset needs to be applied here rather than at
527 * surface state set-up time because the desired slice-level may
528 * start mid-tile, so simply shifting the surface base address
529 * wouldn't give a well-formed tiled surface in the general case.
530 */
531 for (unsigned c = 0; c < 2; ++c)
532 bld.ADD(offset(addr, bld, c), offset(off, bld, c),
533 (c < dims ?
534 offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
535 fs_reg(brw_imm_d(0))));
536
537 /* The layout of 3-D textures in memory is sort-of like a tiling
538 * format. At each miplevel, the slices are arranged in rows of
539 * 2^level slices per row. The slice row is stored in tmp.y and
540 * the slice within the row is stored in tmp.x.
541 *
542 * The layout of 2-D array textures and cubemaps is much simpler:
543 * Depending on whether the ARYSPC_LOD0 layout is in use it will be
544 * stored in memory as an array of slices, each one being a 2-D
545 * arrangement of miplevels, or as a 2D arrangement of miplevels,
546 * each one being an array of slices. In either case the separation
547 * between slices of the same LOD is equal to the qpitch value
548 * provided as stride.w.
549 *
550         * This code can be made to handle both 2-D arrays and 3-D textures
551 * by passing in the miplevel as tile.z for 3-D textures and 0 in
552 * tile.z for 2-D array textures.
553 *
554 * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
555 * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
556 * of the hardware 3D texture and 2D array layouts.
557 */
558 if (dims > 2) {
559 /* Decompose z into a major (tmp.y) and a minor (tmp.x)
560 * index.
561 */
562 bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
563 offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
564 bld.SHR(offset(tmp, bld, 1),
565 offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
566 offset(tile, bld, 2));
567
568 /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
569 * slice offset.
570 */
571 for (unsigned c = 0; c < 2; ++c) {
572 bld.MUL(offset(tmp, bld, c),
573 offset(stride, bld, 2 + c), offset(tmp, bld, c));
574 bld.ADD(offset(addr, bld, c),
575 offset(addr, bld, c), offset(tmp, bld, c));
576 }
577 }
578
579 if (dims > 1) {
580 /* Calculate the major/minor x and y indices. In order to
581 * accommodate both X and Y tiling, the Y-major tiling format is
582 * treated as being a bunch of narrow X-tiles placed next to each
583 * other. This means that the tile width for Y-tiling is actually
584 * the width of one sub-column of the Y-major tile where each 4K
585 * tile has 8 512B sub-columns.
586 *
587 * The major Y value is the row of tiles in which the pixel lives.
588 * The major X value is the tile sub-column in which the pixel
589 * lives; for X tiling, this is the same as the tile column, for Y
590 * tiling, each tile has 8 sub-columns. The minor X and Y indices
591 * are the position within the sub-column.
592 */
593 for (unsigned c = 0; c < 2; ++c) {
594 /* Calculate the minor x and y indices. */
595 bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
596 brw_imm_d(0), offset(addr, bld, c));
597
598 /* Calculate the major x and y indices. */
599 bld.SHR(offset(major, bld, c),
600 offset(addr, bld, c), offset(tile, bld, c));
601 }
602
603 /* Calculate the texel index from the start of the tile row and
604 * the vertical coordinate of the row.
605 * Equivalent to:
606 * tmp.x = (major.x << tile.y << tile.x) +
607 * (minor.y << tile.x) + minor.x
608 * tmp.y = major.y << tile.y
609 */
610 bld.SHL(tmp, major, offset(tile, bld, 1));
611 bld.ADD(tmp, tmp, offset(minor, bld, 1));
612 bld.SHL(tmp, tmp, offset(tile, bld, 0));
613 bld.ADD(tmp, tmp, minor);
614 bld.SHL(offset(tmp, bld, 1),
615 offset(major, bld, 1), offset(tile, bld, 1));
616
617 /* Add it to the start of the tile row. */
618 bld.MUL(offset(tmp, bld, 1),
619 offset(tmp, bld, 1), offset(stride, bld, 1));
620 bld.ADD(tmp, tmp, offset(tmp, bld, 1));
621
622 /* Multiply by the Bpp value. */
623 bld.MUL(dst, tmp, stride);
624
625 if (devinfo->gen < 8 && !devinfo->is_baytrail) {
626 /* Take into account the two dynamically specified shifts.
627             * Both are needed to implement swizzling of X-tiled
628 * surfaces. For Y-tiled surfaces only one bit needs to be
629 * XOR-ed with bit 6 of the memory address, so a swz value of
630 * 0xff (actually interpreted as 31 by the hardware) will be
631 * provided to cause the relevant bit of tmp.y to be zero and
632 * turn the first XOR into the identity. For linear surfaces
633 * or platforms lacking address swizzling both shifts will be
634 * 0xff causing the relevant bits of both tmp.x and .y to be
635             * zero, which effectively disables swizzling.
636 */
637 for (unsigned c = 0; c < 2; ++c)
638 bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));
639
640 /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
641 bld.XOR(tmp, tmp, offset(tmp, bld, 1));
642 bld.AND(tmp, tmp, brw_imm_d(1 << 6));
643 bld.XOR(dst, dst, tmp);
644 }
645
646 } else {
647         /* Multiply by the Bpp/stride value.  Note that addr.y may be
648 * non-zero even if the image is one-dimensional because a
649 * vertical offset may have been applied above to select a
650 * non-zero slice or level of a higher-dimensional texture.
651 */
652 bld.MUL(offset(addr, bld, 1),
653 offset(addr, bld, 1), offset(stride, bld, 1));
654 bld.ADD(addr, addr, offset(addr, bld, 1));
655 bld.MUL(dst, addr, stride);
656 }
657
658 return dst;
659 }
660 }
661
662 namespace image_format_conversion {
663 using image_format_info::color_u;
664
665 namespace {
666 /**
667 * Maximum representable value in an unsigned integer with the given
668 * number of bits.
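             * E.g. scale(8) == 255 and scale(16) == 65535.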
669 */
670 inline unsigned
671 scale(unsigned n)
672 {
673 return (1 << n) - 1;
674 }
675 }
676
677 /**
678 * Pack the vector \p src in a bitfield given the per-component bit
679 * shifts and widths. Note that bitfield components are not allowed to
680 * cross 32-bit boundaries.
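          * E.g. (illustrative) for an RGBA16 layout with shifts
          * {0, 16, 32, 48} and widths {16, 16, 16, 16}, R and G land in bits
          * 0-15 and 16-31 of the first dword and B and A in bits 0-15 and
          * 16-31 of the second.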
681 */
682 fs_reg
683 emit_pack(const fs_builder &bld, const fs_reg &src,
684 const color_u &shifts, const color_u &widths)
685 {
686 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
687 bool seen[4] = {};
688
689 for (unsigned c = 0; c < 4; ++c) {
690 if (widths[c]) {
691 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
692
693 /* Shift each component left to the correct bitfield position. */
694 bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));
695
696 /* Add everything up. */
697 if (seen[shifts[c] / 32]) {
698 bld.OR(offset(dst, bld, shifts[c] / 32),
699 offset(dst, bld, shifts[c] / 32), tmp);
700 } else {
701 bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
702 seen[shifts[c] / 32] = true;
703 }
704 }
705 }
706
707 return dst;
708 }
709
710 /**
711 * Unpack a vector from the bitfield \p src given the per-component bit
712 * shifts and widths. Note that bitfield components are not allowed to
713 * cross 32-bit boundaries.
714 */
715 fs_reg
716 emit_unpack(const fs_builder &bld, const fs_reg &src,
717 const color_u &shifts, const color_u &widths)
718 {
719 const fs_reg dst = bld.vgrf(src.type, 4);
720
721 for (unsigned c = 0; c < 4; ++c) {
722 if (widths[c]) {
723 /* Shift left to discard the most significant bits. */
724 bld.SHL(offset(dst, bld, c),
725 offset(src, bld, shifts[c] / 32),
726 brw_imm_ud(32 - shifts[c] % 32 - widths[c]));
727
728 /* Shift back to the least significant bits using an arithmetic
729 * shift to get sign extension on signed types.
730 */
731 bld.ASR(offset(dst, bld, c),
732 offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
733 }
734 }
735
736 return dst;
737 }
738
739 /**
740 * Convert an integer vector into another integer vector of the
741 * specified bit widths, properly handling overflow.
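          * E.g. for an 8-bit channel an unsigned source is clamped to at
          * most 255, while a signed source is clamped to [-128, 127] and
          * then masked to its low eight bits so the hardware never sees a
          * negative value.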
742 */
743 fs_reg
744 emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
745 const color_u &widths, bool is_signed)
746 {
747 const unsigned s = (is_signed ? 1 : 0);
748 const fs_reg dst = bld.vgrf(
749 is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
750 assert(src.type == dst.type);
751
752 for (unsigned c = 0; c < 4; ++c) {
753 if (widths[c]) {
754 /* Clamp to the maximum value. */
755 bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
756 brw_imm_d((int)scale(widths[c] - s)),
757 BRW_CONDITIONAL_L);
758
759 /* Clamp to the minimum value. */
760 if (is_signed)
761 bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
762 brw_imm_d(-(int)scale(widths[c] - s) - 1),
763 BRW_CONDITIONAL_GE);
764
765 /* Mask off all but the bits we actually want. Otherwise, if
766 * we pass a negative number into the hardware when it's
767 * expecting something like UINT8, it will happily clamp it to
768 * +255 for us.
769 */
770 if (is_signed && widths[c] < 32)
771 bld.AND(offset(dst, bld, c), offset(dst, bld, c),
772 brw_imm_d(scale(widths[c])));
773 }
774 }
775
776 return dst;
777 }
778
779 /**
780 * Convert a normalized fixed-point vector of the specified signedness
781 * and bit widths into a floating point vector.
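          * E.g. an 8-bit UNORM value v becomes v / 255.0, while an 8-bit
          * SNORM value becomes v / 127.0 clamped to no less than -1.0.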
782 */
783 fs_reg
784 emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
785 const color_u &widths, bool is_signed)
786 {
787 const unsigned s = (is_signed ? 1 : 0);
788 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
789
790 for (unsigned c = 0; c < 4; ++c) {
791 if (widths[c]) {
792 /* Convert to float. */
793 bld.MOV(offset(dst, bld, c), offset(src, bld, c));
794
795 /* Divide by the normalization constants. */
796 bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
797 brw_imm_f(1.0f / scale(widths[c] - s)));
798
799 /* Clamp to the minimum value. */
800 if (is_signed)
801 bld.emit_minmax(offset(dst, bld, c),
802 offset(dst, bld, c), brw_imm_f(-1.0f),
803 BRW_CONDITIONAL_GE);
804 }
805 }
806 return dst;
807 }
808
809 /**
810 * Convert a floating-point vector into a normalized fixed-point vector
811 * of the specified signedness and bit widths.
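          * E.g. for an 8-bit SNORM channel, -1.0f is clamped, scaled to
          * -127, rounded, and finally masked to the 8-bit pattern 0x81
          * expected by the hardware.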
812 */
813 fs_reg
814 emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
815 const color_u &widths, bool is_signed)
816 {
817 const unsigned s = (is_signed ? 1 : 0);
818 const fs_reg dst = bld.vgrf(
819 is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
820 const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
821
822 for (unsigned c = 0; c < 4; ++c) {
823 if (widths[c]) {
824 /* Clamp the normalized floating-point argument. */
825 if (is_signed) {
826 bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
827 brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);
828
829 bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
830 brw_imm_f(1.0f), BRW_CONDITIONAL_L);
831 } else {
832 set_saturate(true, bld.MOV(offset(fdst, bld, c),
833 offset(src, bld, c)));
834 }
835
836 /* Multiply by the normalization constants. */
837 bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
838 brw_imm_f((float)scale(widths[c] - s)));
839
840 /* Convert to integer. */
841 bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
842 bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));
843
844 /* Mask off all but the bits we actually want. Otherwise, if
845 * we pass a negative number into the hardware when it's
846 * expecting something like UINT8, it will happily clamp it to
847 * +255 for us.
848 */
849 if (is_signed && widths[c] < 32)
850 bld.AND(offset(dst, bld, c), offset(dst, bld, c),
851 brw_imm_d(scale(widths[c])));
852 }
853 }
854
855 return dst;
856 }
857
858 /**
859 * Convert a floating point vector of the specified bit widths into a
860 * 32-bit floating point vector.
861 */
862 fs_reg
863 emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
864 const color_u &widths)
865 {
866 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
867 const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
868
869 for (unsigned c = 0; c < 4; ++c) {
870 if (widths[c]) {
871 bld.MOV(offset(dst, bld, c), offset(src, bld, c));
872
873 /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
874 * This works because they have a 5-bit exponent just like the
875 * 16-bit floating point format, and they have no sign bit.
876 */
877 if (widths[c] < 16)
878 bld.SHL(offset(dst, bld, c),
879 offset(dst, bld, c), brw_imm_ud(15 - widths[c]));
880
881 /* Convert to 32-bit floating point. */
882 bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
883 }
884 }
885
886 return fdst;
887 }
888
889 /**
890 * Convert a vector into a floating point vector of the specified bit
891 * widths.
892 */
893 fs_reg
894 emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
895 const color_u &widths)
896 {
897 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
898 const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
899
900 for (unsigned c = 0; c < 4; ++c) {
901 if (widths[c]) {
902 bld.MOV(offset(fdst, bld, c), offset(src, bld, c));
903
904 /* Clamp to the minimum value. */
905 if (widths[c] < 16)
906 bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
907 brw_imm_f(0.0f), BRW_CONDITIONAL_GE);
908
909 /* Convert to 16-bit floating-point. */
910 bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));
911
912 /* Discard the least significant bits to get floating point
913 * numbers of the requested width. This works because the
914 * 10-bit and 11-bit floating point formats have a 5-bit
915 * exponent just like the 16-bit format, and they have no sign
916 * bit.
917 */
918 if (widths[c] < 16)
919 bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
920 brw_imm_ud(15 - widths[c]));
921 }
922 }
923
924 return dst;
925 }
926
927 /**
928 * Fill missing components of a vector with 0, 0, 0, 1.
929 */
930 fs_reg
931 emit_pad(const fs_builder &bld, const fs_reg &src,
932 const color_u &widths)
933 {
934 const fs_reg dst = bld.vgrf(src.type, 4);
935 const unsigned pad[] = { 0, 0, 0, 1 };
936
937 for (unsigned c = 0; c < 4; ++c)
938 bld.MOV(offset(dst, bld, c),
939 widths[c] ? offset(src, bld, c)
940 : fs_reg(brw_imm_ud(pad[c])));
941
942 return dst;
943 }
944 }
945 }
946
947 namespace brw {
948 namespace image_access {
949 /**
950 * Load a vector from a surface of the given format and dimensionality
951 * at the given coordinates. \p surf_dims and \p arr_dims give the
952 * number of non-array and array coordinates of the image respectively.
953 */
954 fs_reg
955 emit_image_load(const fs_builder &bld,
956 const fs_reg &image, const fs_reg &addr,
957 unsigned surf_dims, unsigned arr_dims,
958 unsigned gl_format)
959 {
960 using namespace image_format_info;
961 using namespace image_format_conversion;
962 using namespace image_validity;
963 using namespace image_coordinates;
964 using namespace surface_access;
965 const gen_device_info *devinfo = bld.shader->devinfo;
966 const isl_format format = isl_format_for_gl_format(gl_format);
967 const isl_format lower_format =
968 isl_lower_storage_image_format(devinfo, format);
969 fs_reg tmp;
970
971 /* Transform the image coordinates into actual surface coordinates. */
972 const fs_reg saddr =
973 emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
974 const unsigned dims =
975 num_image_coordinates(bld, surf_dims, arr_dims, format);
976
977 if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
978 /* Hopefully we get here most of the time... */
979 tmp = emit_typed_read(bld, image, saddr, dims,
980 isl_format_get_num_channels(lower_format));
981 } else {
982 /* Untyped surface reads return 32 bits of the surface per
983 * component, without any sort of unpacking or type conversion,
984 */
985 const unsigned size = isl_format_get_layout(format)->bpb / 32;
986 /* they don't properly handle out of bounds access, so we have to
987 * check manually if the coordinates are valid and predicate the
988 * surface read on the result,
989 */
990 const brw_predicate pred =
991 emit_untyped_image_check(bld, image,
992 emit_bounds_check(bld, image,
993 saddr, dims));
994
995          * and they don't know about surface coordinates, so we need to
996 * convert them to a raw memory offset.
997 */
998 const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);
999
1000 tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);
1001
1002 /* An out of bounds surface access should give zero as result. */
1003 for (unsigned c = 0; c < size; ++c)
1004 set_predicate(pred, bld.SEL(offset(tmp, bld, c),
1005 offset(tmp, bld, c), brw_imm_d(0)));
1006 }
1007
1008 /* Set the register type to D instead of UD if the data type is
1009 * represented as a signed integer in memory so that sign extension
1010 * is handled correctly by unpack.
1011 */
1012 if (needs_sign_extension(format))
1013 tmp = retype(tmp, BRW_REGISTER_TYPE_D);
1014
1015 if (!has_supported_bit_layout(devinfo, format)) {
1016 /* Unpack individual vector components from the bitfield if the
1017 * hardware is unable to do it for us.
1018 */
1019 if (has_split_bit_layout(devinfo, format))
1020 tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
1021 get_bit_widths(lower_format));
1022 else
1023 tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
1024 get_bit_widths(format));
1025
1026 } else if ((needs_sign_extension(format) &&
1027 !is_conversion_trivial(devinfo, format)) ||
1028 has_undefined_high_bits(devinfo, format)) {
1029 /* Perform a trivial unpack even though the bit layout matches in
1030 * order to get the most significant bits of each component
1031 * initialized properly.
1032 */
1033 tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
1034 get_bit_widths(format));
1035 }
1036
1037 if (!isl_format_has_int_channel(format)) {
1038 if (is_conversion_trivial(devinfo, format)) {
1039 /* Just need to cast the vector to the target type. */
1040 tmp = retype(tmp, BRW_REGISTER_TYPE_F);
1041 } else {
1042 /* Do the right sort of type conversion to float. */
1043 if (isl_format_has_float_channel(format))
1044 tmp = emit_convert_from_float(
1045 bld, tmp, get_bit_widths(format));
1046 else
1047 tmp = emit_convert_from_scaled(
1048 bld, tmp, get_bit_widths(format),
1049 isl_format_has_snorm_channel(format));
1050 }
1051 }
1052
1053 /* Initialize missing components of the result. */
1054 return emit_pad(bld, tmp, get_bit_widths(format));
1055 }
1056
1057 /**
1058 * Store a vector in a surface of the given format and dimensionality at
1059 * the given coordinates. \p surf_dims and \p arr_dims give the number
1060 * of non-array and array coordinates of the image respectively.
1061 */
1062 void
1063 emit_image_store(const fs_builder &bld, const fs_reg &image,
1064 const fs_reg &addr, const fs_reg &src,
1065 unsigned surf_dims, unsigned arr_dims,
1066 unsigned gl_format)
1067 {
1068 using namespace image_format_info;
1069 using namespace image_format_conversion;
1070 using namespace image_validity;
1071 using namespace image_coordinates;
1072 using namespace surface_access;
1073 const isl_format format = isl_format_for_gl_format(gl_format);
1074 const gen_device_info *devinfo = bld.shader->devinfo;
1075
1076 /* Transform the image coordinates into actual surface coordinates. */
1077 const fs_reg saddr =
1078 emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
1079 const unsigned dims =
1080 num_image_coordinates(bld, surf_dims, arr_dims, format);
1081
1082 if (gl_format == GL_NONE) {
1083 /* We don't know what the format is, but that's fine because it
1084 * implies write-only access, and typed surface writes are always
1085 * able to take care of type conversion and packing for us.
1086 */
1087 emit_typed_write(bld, image, saddr, src, dims, 4);
1088
1089 } else {
1090 const isl_format lower_format =
1091 isl_lower_storage_image_format(devinfo, format);
1092 fs_reg tmp = src;
1093
1094 if (!is_conversion_trivial(devinfo, format)) {
1095 /* Do the right sort of type conversion. */
1096 if (isl_format_has_float_channel(format))
1097 tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));
1098
1099 else if (isl_format_has_int_channel(format))
1100 tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
1101 isl_format_has_sint_channel(format));
1102
1103 else
1104 tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
1105 isl_format_has_snorm_channel(format));
1106 }
1107
1108 /* We're down to bit manipulation at this point. */
1109 tmp = retype(tmp, BRW_REGISTER_TYPE_UD);
1110
1111 if (!has_supported_bit_layout(devinfo, format)) {
1112 /* Pack the vector components into a bitfield if the hardware
1113 * is unable to do it for us.
1114 */
1115 if (has_split_bit_layout(devinfo, format))
1116 tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
1117 get_bit_widths(lower_format));
1118
1119 else
1120 tmp = emit_pack(bld, tmp, get_bit_shifts(format),
1121 get_bit_widths(format));
1122 }
1123
1124 if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
1125 /* Hopefully we get here most of the time... */
1126 emit_typed_write(bld, image, saddr, tmp, dims,
1127 isl_format_get_num_channels(lower_format));
1128
1129 } else {
1130 /* Untyped surface writes store 32 bits of the surface per
1131 * component, without any sort of packing or type conversion,
1132 */
1133 const unsigned size = isl_format_get_layout(format)->bpb / 32;
1134
1135 /* they don't properly handle out of bounds access, so we have
1136 * to check manually if the coordinates are valid and predicate
1137 * the surface write on the result,
1138 */
1139 const brw_predicate pred =
1140 emit_untyped_image_check(bld, image,
1141 emit_bounds_check(bld, image,
1142 saddr, dims));
1143
1144             * and, phew, they don't know about surface coordinates, so we
1145 * need to convert them to a raw memory offset.
1146 */
1147 const fs_reg laddr = emit_address_calculation(
1148 bld, image, saddr, dims);
1149
1150 emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
1151 }
1152 }
1153 }
1154
1155 /**
1156 * Perform an atomic read-modify-write operation in a surface of the
1157 * given dimensionality at the given coordinates. \p surf_dims and \p
1158 * arr_dims give the number of non-array and array coordinates of the
1159 * image respectively. Main building block of the imageAtomic GLSL
1160 * built-ins.
1161 */
1162 fs_reg
1163 emit_image_atomic(const fs_builder &bld,
1164 const fs_reg &image, const fs_reg &addr,
1165 const fs_reg &src0, const fs_reg &src1,
1166 unsigned surf_dims, unsigned arr_dims,
1167 unsigned rsize, unsigned op)
1168 {
1169 using namespace image_validity;
1170 using namespace image_coordinates;
1171 using namespace surface_access;
1172 /* Avoid performing an atomic operation on an unbound surface. */
1173 const brw_predicate pred = emit_typed_atomic_check(bld, image);
1174
1175 /* Transform the image coordinates into actual surface coordinates. */
1176 const fs_reg saddr =
1177 emit_image_coordinates(bld, addr, surf_dims, arr_dims,
1178 ISL_FORMAT_R32_UINT);
1179 const unsigned dims =
1180 num_image_coordinates(bld, surf_dims, arr_dims,
1181 ISL_FORMAT_R32_UINT);
1182
1183 /* Thankfully we can do without untyped atomics here. */
1184 const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
1185 dims, rsize, op, pred);
1186
1187 /* An unbound surface access should give zero as result. */
1188 if (rsize && pred)
1189 set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));
1190
1191 return retype(tmp, src0.type);
1192 }
1193 }
1194 }