/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "isl/isl.h"

#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_format_convert.h"

/* The higher compiler layers use the GL enums for image formats even if
 * they come in from SPIR-V or Vulkan.  We need to turn them into an ISL
 * enum before we can use them.
 */
static enum isl_format
isl_format_for_gl_format(uint32_t gl_format)
{
   switch (gl_format) {
   case GL_R8:             return ISL_FORMAT_R8_UNORM;
   case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
   case GL_R8UI:           return ISL_FORMAT_R8_UINT;
   case GL_R8I:            return ISL_FORMAT_R8_SINT;
   case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
   case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
   case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
   case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
   case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
   case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
   case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
   case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
   case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
   case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
   case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
   case GL_R16:            return ISL_FORMAT_R16_UNORM;
   case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
   case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
   case GL_R16UI:          return ISL_FORMAT_R16_UINT;
   case GL_R16I:           return ISL_FORMAT_R16_SINT;
   case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
   case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
   case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
   case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
   case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
   case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
   case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
   case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
   case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
   case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
   case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
   case GL_R32UI:          return ISL_FORMAT_R32_UINT;
   case GL_R32I:           return ISL_FORMAT_R32_SINT;
   case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
   case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
   case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
   case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
   case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
   case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
   case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
   default:
      assert(!"Invalid image format");
      return ISL_FORMAT_UNSUPPORTED;
   }
}

static nir_ssa_def *
_load_image_param(nir_builder *b, nir_deref_instr *deref, unsigned offset)
{
   nir_intrinsic_instr *load =
      nir_intrinsic_instr_create(b->shader,
                                 nir_intrinsic_image_deref_load_param_intel);
   load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
   nir_intrinsic_set_base(load, offset / 4);

   switch (offset) {
   case BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET:
      load->num_components = 1;
      break;
   case BRW_IMAGE_PARAM_OFFSET_OFFSET:
   case BRW_IMAGE_PARAM_SWIZZLING_OFFSET:
      load->num_components = 2;
      break;
   case BRW_IMAGE_PARAM_TILING_OFFSET:
   case BRW_IMAGE_PARAM_SIZE_OFFSET:
      load->num_components = 3;
      break;
   case BRW_IMAGE_PARAM_STRIDE_OFFSET:
      load->num_components = 4;
      break;
   default:
      unreachable("Invalid param offset");
   }
   nir_ssa_dest_init(&load->instr, &load->dest,
                     load->num_components, 32, NULL);

   nir_builder_instr_insert(b, &load->instr);
   return &load->dest.ssa;
}

#define load_image_param(b, d, o) \
   _load_image_param(b, d, BRW_IMAGE_PARAM_##o##_OFFSET)
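
/* For example, load_image_param(b, deref, SIZE) expands to
 * _load_image_param(b, deref, BRW_IMAGE_PARAM_SIZE_OFFSET) and, per the
 * switch above, yields the three-component size vector of the bound image.
 */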

static nir_ssa_def *
sanitize_image_coord(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *coord)
{
   if (glsl_get_sampler_dim(deref->type) == GLSL_SAMPLER_DIM_1D &&
       glsl_sampler_type_is_array(deref->type)) {
      /* It's easier if 1D arrays are treated like 2D arrays */
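      /* e.g. a 1D-array coordinate (x, layer) becomes (x, 0, layer). */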
      return nir_vec3(b, nir_channel(b, coord, 0),
                         nir_imm_int(b, 0),
                         nir_channel(b, coord, 1));
   } else {
      unsigned dims = glsl_get_sampler_coordinate_components(deref->type);
      return nir_channels(b, coord, (1 << dims) - 1);
   }
}

static nir_ssa_def *
image_coord_is_in_bounds(nir_builder *b, nir_deref_instr *deref,
                         nir_ssa_def *coord)
{
   coord = sanitize_image_coord(b, deref, coord);
   nir_ssa_def *size = load_image_param(b, deref, SIZE);

   nir_ssa_def *cmp = nir_ilt(b, coord, size);
   nir_ssa_def *in_bounds = nir_imm_int(b, NIR_TRUE);
   for (unsigned i = 0; i < coord->num_components; i++)
      in_bounds = nir_iand(b, in_bounds, nir_channel(b, cmp, i));

   return in_bounds;
}

/** Calculate the offset in memory of the texel given by \p coord.
 *
 * This is meant to be used with untyped surface messages to access a tiled
 * surface, which involves manually taking into account the tiling and
 * swizzling modes of the surface, so it will hopefully not happen very
 * often.
 *
 * The tiling algorithm implemented here matches either the X or Y tiling
 * layouts supported by the hardware depending on the tiling coefficients
 * passed to the program as uniforms.  See Volume 1 Part 2 Section 4.5
 * "Address Tiling Function" of the IVB PRM for an in-depth explanation of
 * the hardware tiling format.
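 *
 * As an illustration (tile.x = 4 is an assumed Y-tiling coefficient, i.e.
 * log2 of a 16-byte sub-column width, not a value quoted from the PRM): a
 * byte x-coordinate of 100 would split into minor.x = 100 & 15 = 4 within
 * the sub-column and major.x = 100 >> 4 = 6 sub-columns from the left.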
 */
static nir_ssa_def *
image_address(nir_builder *b, const struct gen_device_info *devinfo,
              nir_deref_instr *deref, nir_ssa_def *coord)
{
   coord = sanitize_image_coord(b, deref, coord);

   nir_ssa_def *offset = load_image_param(b, deref, OFFSET);
   nir_ssa_def *tiling = load_image_param(b, deref, TILING);
   nir_ssa_def *stride = load_image_param(b, deref, STRIDE);

   /* Shift the coordinates by the fixed surface offset.  It may be non-zero
    * if the image is a single slice of a higher-dimensional surface, or if a
    * non-zero mipmap level of the surface is bound to the pipeline.  The
    * offset needs to be applied here rather than at surface state set-up time
    * because the desired slice-level may start mid-tile, so simply shifting
    * the surface base address wouldn't give a well-formed tiled surface in
    * the general case.
    */
   nir_ssa_def *xypos = (coord->num_components == 1) ?
                        nir_vec2(b, coord, nir_imm_int(b, 0)) :
                        nir_channels(b, coord, 0x3);
   xypos = nir_iadd(b, xypos, offset);

   /* The layout of 3-D textures in memory is sort-of like a tiling
    * format.  At each miplevel, the slices are arranged in rows of
    * 2^level slices per row.  The slice row is stored in tmp.y and
    * the slice within the row is stored in tmp.x.
    *
    * The layout of 2-D array textures and cubemaps is much simpler:
    * Depending on whether the ARYSPC_LOD0 layout is in use it will be
    * stored in memory as an array of slices, each one being a 2-D
    * arrangement of miplevels, or as a 2-D arrangement of miplevels,
    * each one being an array of slices.  In either case the separation
    * between slices of the same LOD is equal to the qpitch value
    * provided as stride.w.
    *
    * This code can be made to handle both 2-D array textures and 3-D
    * textures by passing in the miplevel as tile.z for 3-D textures and
    * 0 in tile.z for 2-D array textures.
    *
    * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
    * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
    * of the hardware 3D texture and 2D array layouts.
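    *
    * Worked example (values chosen for illustration, not taken from the
    * PRM): at miplevel 2 of a 3-D texture, tile.z = 2, so slices are laid
    * out in rows of 2^2 = 4.  Slice z = 9 then decomposes into
    * z_x = 9 & 3 = 1 (position within the row) and z_y = 9 >> 2 = 2 (the
    * slice row).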
    */
   if (coord->num_components > 2) {
      /* Decompose z into a major (tmp.y) and a minor (tmp.x)
       * index.
       */
      nir_ssa_def *z = nir_channel(b, coord, 2);
      nir_ssa_def *z_x = nir_ubfe(b, z, nir_imm_int(b, 0),
                                  nir_channel(b, tiling, 2));
      nir_ssa_def *z_y = nir_ushr(b, z, nir_channel(b, tiling, 2));

      /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
       * slice offset.
       */
      xypos = nir_iadd(b, xypos, nir_imul(b, nir_vec2(b, z_x, z_y),
                                             nir_channels(b, stride, 0xc)));
   }

   nir_ssa_def *addr;
   if (coord->num_components > 1) {
      /* Calculate the major/minor x and y indices.  In order to
       * accommodate both X and Y tiling, the Y-major tiling format is
       * treated as being a bunch of narrow X-tiles placed next to each
       * other.  This means that the tile width for Y-tiling is actually
       * the width of one sub-column of the Y-major tile where each 4K
       * tile has 8 512B sub-columns.
       *
       * The major Y value is the row of tiles in which the pixel lives.
       * The major X value is the tile sub-column in which the pixel
       * lives; for X tiling, this is the same as the tile column, for Y
       * tiling, each tile has 8 sub-columns.  The minor X and Y indices
       * are the position within the sub-column.
       */

      /* Calculate the minor x and y indices. */
      nir_ssa_def *minor = nir_ubfe(b, xypos, nir_imm_int(b, 0),
                                    nir_channels(b, tiling, 0x3));
      nir_ssa_def *major = nir_ushr(b, xypos, nir_channels(b, tiling, 0x3));

      /* Calculate the texel index from the start of the tile row and the
       * vertical coordinate of the row.
       * Equivalent to:
       *   tmp.x = (major.x << tile.y << tile.x) +
       *           (minor.y << tile.x) + minor.x
       *   tmp.y = major.y << tile.y
       */
      nir_ssa_def *idx_x, *idx_y;
      idx_x = nir_ishl(b, nir_channel(b, major, 0), nir_channel(b, tiling, 1));
      idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 1));
      idx_x = nir_ishl(b, idx_x, nir_channel(b, tiling, 0));
      idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 0));
      idx_y = nir_ishl(b, nir_channel(b, major, 1), nir_channel(b, tiling, 1));

      /* Add it to the start of the tile row. */
      nir_ssa_def *idx;
      idx = nir_imul(b, idx_y, nir_channel(b, stride, 1));
      idx = nir_iadd(b, idx, idx_x);

      /* Multiply by the Bpp value. */
      addr = nir_imul(b, idx, nir_channel(b, stride, 0));

      if (devinfo->gen < 8 && !devinfo->is_baytrail) {
         /* Take into account the two dynamically specified shifts.  Both are
          * used to implement swizzling of X-tiled surfaces.  For Y-tiled
          * surfaces only one bit needs to be XOR-ed with bit 6 of the memory
          * address, so a swz value of 0xff (actually interpreted as 31 by the
          * hardware) will be provided to cause the relevant bit of tmp.y to
          * be zero and turn the first XOR into the identity.  For linear
          * surfaces or platforms lacking address swizzling both shifts will
          * be 0xff causing the relevant bits of both tmp.x and .y to be zero,
          * which effectively disables swizzling.
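          *
          * As a concrete (assumed, not PRM-sourced) example: a shift of 3
          * makes (addr >> 3) & (1 << 6) pick out bit 9 of the original
          * address, so bit 6 of the final address gets XOR-ed with bit 9.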
          */
         nir_ssa_def *swizzle = load_image_param(b, deref, SWIZZLING);
         nir_ssa_def *shift0 = nir_ushr(b, addr, nir_channel(b, swizzle, 0));
         nir_ssa_def *shift1 = nir_ushr(b, addr, nir_channel(b, swizzle, 1));

         /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
         nir_ssa_def *bit = nir_iand(b, nir_ixor(b, shift0, shift1),
                                     nir_imm_int(b, 1 << 6));
         addr = nir_ixor(b, addr, bit);
      }
   } else {
      /* Multiply by the Bpp/stride value.  Note that addr.y may be
       * non-zero even if the image is one-dimensional because a vertical
       * offset may have been applied above to select a non-zero slice or
       * level of a higher-dimensional texture.
       */
      nir_ssa_def *idx;
      idx = nir_imul(b, nir_channel(b, xypos, 1), nir_channel(b, stride, 1));
      idx = nir_iadd(b, nir_channel(b, xypos, 0), idx);
      addr = nir_imul(b, idx, nir_channel(b, stride, 0));
   }

   return addr;
}

struct format_info {
   const struct isl_format_layout *fmtl;
   unsigned chans;
   unsigned bits[4];
};

static struct format_info
get_format_info(enum isl_format fmt)
{
   const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);

   return (struct format_info) {
      .fmtl = fmtl,
      .chans = isl_format_get_num_channels(fmt),
      .bits = {
         fmtl->channels.r.bits,
         fmtl->channels.g.bits,
         fmtl->channels.b.bits,
         fmtl->channels.a.bits
      },
   };
}

static nir_ssa_def *
nir_zero_vec(nir_builder *b, unsigned num_components)
{
   nir_const_value v;
   memset(&v, 0, sizeof(v));

   return nir_build_imm(b, num_components, 32, v);
}

static nir_ssa_def *
convert_color_for_load(nir_builder *b, const struct gen_device_info *devinfo,
                       nir_ssa_def *color,
                       enum isl_format image_fmt, enum isl_format lower_fmt,
                       unsigned dest_components)
{
   if (image_fmt == lower_fmt)
      goto expand_vec;

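   /* Packed 11F/11F/10F has no per-channel lowered equivalent; it is read
    * back as a single R32_UINT value and unpacked in the shader.
    */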
   if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
      assert(lower_fmt == ISL_FORMAT_R32_UINT);
      color = nir_format_unpack_11f11f10f(b, color);
      goto expand_vec;
   }

   struct format_info image = get_format_info(image_fmt);
   struct format_info lower = get_format_info(lower_fmt);

   const bool needs_sign_extension =
      isl_format_has_snorm_channel(image_fmt) ||
      isl_format_has_sint_channel(image_fmt);

   /* We only check the red channel to detect if we need to pack/unpack */
   assert(image.bits[0] != lower.bits[0] ||
          memcmp(image.bits, lower.bits, sizeof(image.bits)) == 0);

   if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
      if (needs_sign_extension)
         color = nir_format_unpack_sint(b, color, image.bits, image.chans);
      else
         color = nir_format_unpack_uint(b, color, image.bits, image.chans);
   } else {
      /* All these formats are homogeneous */
      for (unsigned i = 1; i < image.chans; i++)
         assert(image.bits[i] == image.bits[0]);

      /* On IVB, we rely on the undocumented behavior that typed reads from
       * surfaces of the unsupported R8 and R16 formats return useful data in
       * their least significant bits.  However, the data in the high bits is
       * garbage so we have to discard it.
       */
      if (devinfo->gen == 7 && !devinfo->is_haswell &&
          (lower_fmt == ISL_FORMAT_R16_UINT ||
           lower_fmt == ISL_FORMAT_R8_UINT))
         color = nir_format_mask_uvec(b, color, lower.bits);

      if (image.bits[0] != lower.bits[0]) {
         color = nir_format_bitcast_uvec_unmasked(b, color, lower.bits[0],
                                                  image.bits[0]);
      }

      if (needs_sign_extension)
         color = nir_format_sign_extend_ivec(b, color, image.bits);
   }

   switch (image.fmtl->channels.r.type) {
   case ISL_UNORM:
      assert(isl_format_has_uint_channel(lower_fmt));
      color = nir_format_unorm_to_float(b, color, image.bits);
      break;

   case ISL_SNORM:
      assert(isl_format_has_uint_channel(lower_fmt));
      color = nir_format_snorm_to_float(b, color, image.bits);
      break;

   case ISL_SFLOAT:
      if (image.bits[0] == 16)
         color = nir_unpack_half_2x16_split_x(b, color);
      break;

   case ISL_UINT:
   case ISL_SINT:
      break;

   default:
      unreachable("Invalid image channel type");
   }

expand_vec:
   assert(dest_components == 1 || dest_components == 4);
   assert(color->num_components <= dest_components);
   if (color->num_components == dest_components)
      return color;

   nir_ssa_def *comps[4];
   for (unsigned i = 0; i < color->num_components; i++)
      comps[i] = nir_channel(b, color, i);

   for (unsigned i = color->num_components; i < 3; i++)
      comps[i] = nir_imm_int(b, 0);

   if (color->num_components < 4) {
      if (isl_format_has_int_channel(image_fmt))
         comps[3] = nir_imm_int(b, 1);
      else
         comps[3] = nir_imm_float(b, 1);
   }
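
   /* For example, a one-channel R32F load expanded to a vec4 destination
    * becomes (r, 0, 0, 1.0), matching the usual (0, 0, 0, 1) default fill.
    */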

   return nir_vec(b, comps, dest_components);
}

static bool
lower_image_load_instr(nir_builder *b,
                       const struct gen_device_info *devinfo,
                       nir_intrinsic_instr *intrin)
{
   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
   nir_variable *var = nir_deref_instr_get_variable(deref);
   const enum isl_format image_fmt =
      isl_format_for_gl_format(var->data.image.format);

   if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
      const enum isl_format lower_fmt =
         isl_lower_storage_image_format(devinfo, image_fmt);
      const unsigned dest_components = intrin->num_components;

      /* Use an undef to hold the uses of the load while we do the color
       * conversion.
       */
      nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32);
      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder));

      intrin->num_components = isl_format_get_num_channels(lower_fmt);
      intrin->dest.ssa.num_components = intrin->num_components;

      b->cursor = nir_after_instr(&intrin->instr);

      nir_ssa_def *color = convert_color_for_load(b, devinfo,
                                                  &intrin->dest.ssa,
                                                  image_fmt, lower_fmt,
                                                  dest_components);

      nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(color));
      nir_instr_remove(placeholder->parent_instr);
   } else {
      const struct isl_format_layout *image_fmtl =
         isl_format_get_layout(image_fmt);
      /* We have a matching typed format for everything 32b and below */
      assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
      enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
                                ISL_FORMAT_R32G32_UINT :
                                ISL_FORMAT_R32G32B32A32_UINT;
      const unsigned dest_components = intrin->num_components;

      b->cursor = nir_instr_remove(&intrin->instr);

      nir_ssa_def *coord = intrin->src[1].ssa;

      nir_ssa_def *do_load = image_coord_is_in_bounds(b, deref, coord);
      if (devinfo->gen == 7 && !devinfo->is_haswell) {
         /* Check whether the first stride component (i.e. the Bpp value)
          * is greater than four, which on Gen7 indicates that a surface of
          * type RAW has been bound for untyped access.  Reading or writing
          * to a surface of type other than RAW using untyped surface
          * messages causes a hang on IVB and VLV.
          */
         nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
         nir_ssa_def *is_raw =
            nir_ilt(b, nir_imm_int(b, 4), nir_channel(b, stride, 0));
         do_load = nir_iand(b, do_load, is_raw);
      }
      nir_push_if(b, do_load);

      nir_ssa_def *addr = image_address(b, devinfo, deref, coord);
      nir_intrinsic_instr *load =
         nir_intrinsic_instr_create(b->shader,
                                    nir_intrinsic_image_deref_load_raw_intel);
      load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
      load->src[1] = nir_src_for_ssa(addr);
      load->num_components = image_fmtl->bpb / 32;
      nir_ssa_dest_init(&load->instr, &load->dest,
                        load->num_components, 32, NULL);
      nir_builder_instr_insert(b, &load->instr);

      nir_push_else(b, NULL);

      nir_ssa_def *zero = nir_zero_vec(b, load->num_components);

      nir_pop_if(b, NULL);

      nir_ssa_def *value = nir_if_phi(b, &load->dest.ssa, zero);

      nir_ssa_def *color = convert_color_for_load(b, devinfo, value,
                                                  image_fmt, raw_fmt,
                                                  dest_components);

      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(color));
   }

   return true;
}

static nir_ssa_def *
convert_color_for_store(nir_builder *b, const struct gen_device_info *devinfo,
                        nir_ssa_def *color,
                        enum isl_format image_fmt, enum isl_format lower_fmt)
{
   struct format_info image = get_format_info(image_fmt);
   struct format_info lower = get_format_info(lower_fmt);

   color = nir_channels(b, color, (1 << image.chans) - 1);

   if (image_fmt == lower_fmt)
      return color;

   if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
      assert(lower_fmt == ISL_FORMAT_R32_UINT);
      return nir_format_pack_11f11f10f(b, color);
   }

   switch (image.fmtl->channels.r.type) {
   case ISL_UNORM:
      assert(isl_format_has_uint_channel(lower_fmt));
      color = nir_format_float_to_unorm(b, color, image.bits);
      break;

   case ISL_SNORM:
      assert(isl_format_has_uint_channel(lower_fmt));
      color = nir_format_float_to_snorm(b, color, image.bits);
      break;

   case ISL_SFLOAT:
      if (image.bits[0] == 16) {
         nir_ssa_def *f16comps[4];
         for (unsigned i = 0; i < image.chans; i++) {
            f16comps[i] = nir_pack_half_2x16_split(b, nir_channel(b, color, i),
                                                   nir_imm_float(b, 0));
         }
         color = nir_vec(b, f16comps, image.chans);
      }
      break;

   case ISL_UINT:
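      /* Clamp each channel to the largest representable unsigned value,
       * e.g. (1u << 8) - 1 = 255 per channel for an RGBA8UI image.
       */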
      if (image.bits[0] < 32) {
         nir_const_value max;
         for (unsigned i = 0; i < image.chans; i++) {
            assert(image.bits[i] < 32);
            max.u32[i] = (1u << image.bits[i]) - 1;
         }
         color = nir_umin(b, color, nir_build_imm(b, image.chans, 32, max));
      }
      break;

   case ISL_SINT:
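      /* Clamp to the two's-complement range of the channel width,
       * e.g. [-128, 127] per channel for an RGBA8I image.
       */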
      if (image.bits[0] < 32) {
         nir_const_value min, max;
         for (unsigned i = 0; i < image.chans; i++) {
            assert(image.bits[i] < 32);
            max.i32[i] = (1 << (image.bits[i] - 1)) - 1;
            min.i32[i] = -(1 << (image.bits[i] - 1));
         }
         color = nir_imin(b, color, nir_build_imm(b, image.chans, 32, max));
         color = nir_imax(b, color, nir_build_imm(b, image.chans, 32, min));
      }
      break;

   default:
      unreachable("Invalid image channel type");
   }

   if (image.bits[0] < 32 &&
       (isl_format_has_snorm_channel(image_fmt) ||
        isl_format_has_sint_channel(image_fmt)))
      color = nir_format_mask_uvec(b, color, image.bits);

   if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
      color = nir_format_pack_uint(b, color, image.bits, image.chans);
   } else {
      /* All these formats are homogeneous */
      for (unsigned i = 1; i < image.chans; i++)
         assert(image.bits[i] == image.bits[0]);

      if (image.bits[0] != lower.bits[0]) {
         color = nir_format_bitcast_uvec_unmasked(b, color, image.bits[0],
                                                  lower.bits[0]);
      }
   }

   return color;
}

static bool
lower_image_store_instr(nir_builder *b,
                        const struct gen_device_info *devinfo,
                        nir_intrinsic_instr *intrin)
{
   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
   nir_variable *var = nir_deref_instr_get_variable(deref);

   /* For write-only surfaces, we trust that the hardware can just do the
    * conversion for us.
    */
   if (var->data.image.write_only)
      return false;

   const enum isl_format image_fmt =
      isl_format_for_gl_format(var->data.image.format);

   if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
      const enum isl_format lower_fmt =
         isl_lower_storage_image_format(devinfo, image_fmt);

      /* Color conversion goes before the store */
      b->cursor = nir_before_instr(&intrin->instr);

      nir_ssa_def *color = convert_color_for_store(b, devinfo,
                                                   intrin->src[3].ssa,
                                                   image_fmt, lower_fmt);
      intrin->num_components = isl_format_get_num_channels(lower_fmt);
      nir_instr_rewrite_src(&intrin->instr, &intrin->src[3],
                            nir_src_for_ssa(color));
   } else {
      const struct isl_format_layout *image_fmtl =
         isl_format_get_layout(image_fmt);
      /* We have a matching typed format for everything 32b and below */
      assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
      enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
                                ISL_FORMAT_R32G32_UINT :
                                ISL_FORMAT_R32G32B32A32_UINT;

      b->cursor = nir_instr_remove(&intrin->instr);

      nir_ssa_def *coord = intrin->src[1].ssa;

      nir_ssa_def *do_store = image_coord_is_in_bounds(b, deref, coord);
      if (devinfo->gen == 7 && !devinfo->is_haswell) {
         /* Check whether the first stride component (i.e. the Bpp value)
          * is greater than four, which on Gen7 indicates that a surface of
          * type RAW has been bound for untyped access.  Reading or writing
          * to a surface of type other than RAW using untyped surface
          * messages causes a hang on IVB and VLV.
          */
         nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
         nir_ssa_def *is_raw =
            nir_ilt(b, nir_imm_int(b, 4), nir_channel(b, stride, 0));
         do_store = nir_iand(b, do_store, is_raw);
      }
      nir_push_if(b, do_store);

      nir_ssa_def *addr = image_address(b, devinfo, deref, coord);
      nir_ssa_def *color = convert_color_for_store(b, devinfo,
                                                   intrin->src[3].ssa,
                                                   image_fmt, raw_fmt);

      nir_intrinsic_instr *store =
         nir_intrinsic_instr_create(b->shader,
                                    nir_intrinsic_image_deref_store_raw_intel);
      store->src[0] = nir_src_for_ssa(&deref->dest.ssa);
      store->src[1] = nir_src_for_ssa(addr);
      store->src[2] = nir_src_for_ssa(color);
      store->num_components = image_fmtl->bpb / 32;
      nir_builder_instr_insert(b, &store->instr);

      nir_pop_if(b, NULL);
   }

   return true;
}

static bool
lower_image_atomic_instr(nir_builder *b,
                         const struct gen_device_info *devinfo,
                         nir_intrinsic_instr *intrin)
{
   if (devinfo->is_haswell || devinfo->gen >= 8)
      return false;

   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);

   b->cursor = nir_instr_remove(&intrin->instr);

   /* Use an undef to hold the uses of the atomic result while we wrap it
    * in the bound-image check below.
    */
   nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32);
   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder));

   /* Check the first component of the size field to find out if the
    * image is bound.  Necessary on IVB for typed atomics because
    * they don't seem to respect null surfaces and will happily
    * corrupt or read random memory when no image is bound.
    */
   nir_ssa_def *size = load_image_param(b, deref, SIZE);
   nir_ssa_def *zero = nir_imm_int(b, 0);
   nir_push_if(b, nir_ine(b, nir_channel(b, size, 0), zero));

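   /* Re-emit the original atomic, which was removed from its old location
    * above, inside the bound-image check.
    */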
   nir_builder_instr_insert(b, &intrin->instr);

   nir_pop_if(b, NULL);

   nir_ssa_def *result = nir_if_phi(b, &intrin->dest.ssa, zero);
   nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(result));

   return true;
}

static bool
lower_image_size_instr(nir_builder *b,
                       const struct gen_device_info *devinfo,
                       nir_intrinsic_instr *intrin)
{
   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);

   b->cursor = nir_instr_remove(&intrin->instr);

   nir_ssa_def *size = load_image_param(b, deref, SIZE);

   nir_ssa_def *comps[4] = { NULL, NULL, NULL, NULL };

   enum glsl_sampler_dim dim = glsl_get_sampler_dim(deref->type);
   unsigned coord_comps = glsl_get_sampler_coordinate_components(deref->type);
   for (unsigned c = 0; c < coord_comps; c++) {
      if (c == 1 && dim == GLSL_SAMPLER_DIM_1D) {
         /* The array length for 1D arrays is in .z */
         comps[1] = nir_channel(b, size, 2);
      } else if (c == 2 && dim == GLSL_SAMPLER_DIM_CUBE) {
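         /* Cube images are laid out as arrays of 2-D slices with six faces
          * per cube, so divide the slice count by 6 (an interpretation based
          * on the divisor used here).
          */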
         comps[2] = nir_idiv(b, nir_channel(b, size, 2), nir_imm_int(b, 6));
      } else {
         comps[c] = nir_channel(b, size, c);
      }
   }

   for (unsigned c = coord_comps; c < intrin->dest.ssa.num_components; ++c)
      comps[c] = nir_imm_int(b, 1);

   nir_ssa_def *vec = nir_vec(b, comps, intrin->dest.ssa.num_components);
   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(vec));

   return true;
}

bool
brw_nir_lower_image_load_store(nir_shader *shader,
                               const struct gen_device_info *devinfo)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl == NULL)
         continue;

      nir_foreach_block_safe(block, function->impl) {
         nir_builder b;
         nir_builder_init(&b, function->impl);

         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            switch (intrin->intrinsic) {
            case nir_intrinsic_image_deref_load:
               if (lower_image_load_instr(&b, devinfo, intrin))
                  progress = true;
               break;

            case nir_intrinsic_image_deref_store:
               if (lower_image_store_instr(&b, devinfo, intrin))
                  progress = true;
               break;

            case nir_intrinsic_image_deref_atomic_add:
            case nir_intrinsic_image_deref_atomic_min:
            case nir_intrinsic_image_deref_atomic_max:
            case nir_intrinsic_image_deref_atomic_and:
            case nir_intrinsic_image_deref_atomic_or:
            case nir_intrinsic_image_deref_atomic_xor:
            case nir_intrinsic_image_deref_atomic_exchange:
            case nir_intrinsic_image_deref_atomic_comp_swap:
               if (lower_image_atomic_instr(&b, devinfo, intrin))
                  progress = true;
               break;

            case nir_intrinsic_image_deref_size:
               if (lower_image_size_instr(&b, devinfo, intrin))
                  progress = true;
               break;

            default:
               /* Nothing to do */
               break;
            }
         }
      }

      nir_metadata_preserve(function->impl, nir_metadata_block_index |
                                            nir_metadata_dominance);
   }

   return progress;
}