Revert "Revert "i965/fs: Use align1 mode on ternary instructions on Gen10+""
[mesa.git] src/intel/compiler/brw_fs_surface_builder.cpp
/*
 * Copyright © 2013-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "isl/isl.h"
#include "brw_fs_surface_builder.h"
#include "brw_fs.h"

using namespace brw;

namespace brw {
   namespace surface_access {
      namespace {
         /**
          * Generate a logical send opcode for a surface message and return
          * the result.
          */
         fs_reg
         emit_send(const fs_builder &bld, enum opcode opcode,
                   const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
                   unsigned dims, unsigned arg, unsigned rsize,
                   brw_predicate pred = BRW_PREDICATE_NONE)
         {
            /* Reduce the dynamically uniform surface index to a single
             * scalar.
             */
            const fs_reg usurface = bld.emit_uniformize(surface);
            const fs_reg srcs[] = {
               addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
            };
            const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
            fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));

            inst->size_written = rsize * dst.component_size(inst->exec_size);
            inst->predicate = pred;
            return dst;
         }
      }

      /**
       * Emit an untyped surface read opcode. \p dims determines the number
       * of components of the address and \p size the number of components of
       * the returned value.
       */
      fs_reg
      emit_untyped_read(const fs_builder &bld,
                        const fs_reg &surface, const fs_reg &addr,
                        unsigned dims, unsigned size,
                        brw_predicate pred)
      {
         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size, pred);
      }

      /**
       * Emit an untyped surface write opcode. \p dims determines the number
       * of components of the address and \p size the number of components of
       * the argument.
       */
      void
      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
                         const fs_reg &addr, const fs_reg &src,
                         unsigned dims, unsigned size,
                         brw_predicate pred)
      {
         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0, pred);
      }

      /**
       * Emit an untyped surface atomic opcode. \p dims determines the number
       * of components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_untyped_atomic(const fs_builder &bld,
                          const fs_reg &surface, const fs_reg &addr,
                          const fs_reg &src0, const fs_reg &src1,
                          unsigned dims, unsigned rsize, unsigned op,
                          brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }

      /**
       * Emit a typed surface read opcode. \p dims determines the number of
       * components of the address and \p size the number of components of the
       * returned value.
       */
      fs_reg
      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
                      const fs_reg &addr, unsigned dims, unsigned size)
      {
         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size);
      }

      /**
       * Emit a typed surface write opcode. \p dims determines the number of
       * components of the address and \p size the number of components of the
       * argument.
       */
      void
      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned dims, unsigned size)
      {
         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0);
      }

      /**
       * Emit a typed surface atomic opcode. \p dims determines the number of
       * components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
                        const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned dims, unsigned rsize, unsigned op,
                        brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize);
      }

      fs_reg
      emit_byte_scattered_read(const fs_builder &bld,
                               const fs_reg &surface, const fs_reg &addr,
                               unsigned dims, unsigned size,
                               unsigned bit_size, brw_predicate pred)
      {
         return emit_send(bld, SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, bit_size, size, pred);
      }

      void
      emit_byte_scattered_write(const fs_builder &bld, const fs_reg &surface,
                                const fs_reg &addr, const fs_reg &src,
                                unsigned dims, unsigned size,
                                unsigned bit_size, brw_predicate pred)
      {
         emit_send(bld, SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
                   addr, src, surface, dims, bit_size, 0, pred);
      }
   }
}

namespace {
   namespace image_format_info {
      /* The higher compiler layers use the GL enums for image formats even if
       * they come in from SPIR-V or Vulkan. We need to turn them into an ISL
       * enum before we can use them.
       */
      static enum isl_format
      isl_format_for_gl_format(uint32_t gl_format)
      {
         switch (gl_format) {
         case GL_R8:             return ISL_FORMAT_R8_UNORM;
         case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
         case GL_R8UI:           return ISL_FORMAT_R8_UINT;
         case GL_R8I:            return ISL_FORMAT_R8_SINT;
         case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
         case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
         case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
         case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
         case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
         case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
         case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
         case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
         case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
         case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
         case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
         case GL_R16:            return ISL_FORMAT_R16_UNORM;
         case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
         case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
         case GL_R16UI:          return ISL_FORMAT_R16_UINT;
         case GL_R16I:           return ISL_FORMAT_R16_SINT;
         case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
         case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
         case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
         case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
         case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
         case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
         case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
         case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
         case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
         case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
         case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
         case GL_R32UI:          return ISL_FORMAT_R32_UINT;
         case GL_R32I:           return ISL_FORMAT_R32_SINT;
         case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
         case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
         case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
         case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
         case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
         case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
         case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
         default:
            assert(!"Invalid image format");
            return ISL_FORMAT_UNSUPPORTED;
         }
      }

      /**
       * Simple 4-tuple of scalars used to pass around per-color component
       * values.
       */
      struct color_u {
         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
         {
         }

         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
            r(r), g(g), b(b), a(a)
         {
         }

         unsigned
         operator[](unsigned i) const
         {
            const unsigned xs[] = { r, g, b, a };
            return xs[i];
         }

         unsigned r, g, b, a;
      };

      /**
       * Return the per-channel bitfield widths for a given image format.
       */
      inline color_u
      get_bit_widths(isl_format format)
      {
         const isl_format_layout *fmtl = isl_format_get_layout(format);

         return color_u(fmtl->channels.r.bits,
                        fmtl->channels.g.bits,
                        fmtl->channels.b.bits,
                        fmtl->channels.a.bits);
      }

      /**
       * Return the per-channel bitfield shifts for a given image format.
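       *
       * For example, ISL_FORMAT_R8G8B8A8_UNORM has per-channel widths of
       * (8, 8, 8, 8), which gives shifts of (0, 8, 16, 24) below.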
       */
      inline color_u
      get_bit_shifts(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return color_u(0, widths.r, widths.r + widths.g,
                        widths.r + widths.g + widths.b);
      }

      /**
       * Return true if all present components have the same bit width.
       */
      inline bool
      is_homogeneous(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return ((widths.g == 0 || widths.g == widths.r) &&
                 (widths.b == 0 || widths.b == widths.r) &&
                 (widths.a == 0 || widths.a == widths.r));
      }

      /**
       * Return true if the format conversion boils down to a trivial copy.
       */
      inline bool
      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
      {
         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
                format == isl_lower_storage_image_format(devinfo, format);
      }

      /**
       * Return true if the hardware natively supports some format with
       * compatible bitfield layout, but possibly different data types.
       */
      inline bool
      has_supported_bit_layout(const gen_device_info *devinfo,
                               isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         const color_u lower_widths = get_bit_widths(
            isl_lower_storage_image_format(devinfo, format));

         return (widths.r == lower_widths.r &&
                 widths.g == lower_widths.g &&
                 widths.b == lower_widths.b &&
                 widths.a == lower_widths.a);
      }

      /**
       * Return true if we are required to spread individual components over
       * several components of the format used by the hardware (RG32 and
       * friends implemented as RGBA16UI).
       */
      inline bool
      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (isl_format_get_num_channels(format) <
                 isl_format_get_num_channels(lower_format));
      }

      /**
       * Return true if the hardware returns garbage in the unused high bits
       * of each component. This may happen on IVB because we rely on the
       * undocumented behavior that typed reads from surfaces of the
       * unsupported R8 and R16 formats return useful data in their least
       * significant bits.
       */
      inline bool
      has_undefined_high_bits(const gen_device_info *devinfo,
                              isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (devinfo->gen == 7 && !devinfo->is_haswell &&
                 (lower_format == ISL_FORMAT_R16_UINT ||
                  lower_format == ISL_FORMAT_R8_UINT));
      }

      /**
       * Return true if the format represents values as signed integers
       * requiring sign extension when unpacking.
       */
      inline bool
      needs_sign_extension(isl_format format)
      {
         return isl_format_has_snorm_channel(format) ||
                isl_format_has_sint_channel(format);
      }
   }

   namespace image_validity {
      /**
       * Check whether the bound image is suitable for untyped access.
       */
      static brw_predicate
      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
                               brw_predicate pred)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check whether the first stride component (i.e. the Bpp value)
             * is greater than four, which on Gen7 indicates that a surface of
             * type RAW has been bound for untyped access. Reading or writing
             * to a surface of type other than RAW using untyped surface
             * messages causes a hang on IVB and VLV.
             */
            set_predicate(pred,
                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
                                  BRW_CONDITIONAL_G));

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent generations handle the format mismatch
             * gracefully.
             */
            return pred;
         }
      }

      /**
       * Check whether there is an image bound at the given index and write
       * the comparison result to f0.0. Returns an appropriate predication
       * mode to use on subsequent image operations.
       */
      static brw_predicate
      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check the first component of the size field to find out if the
             * image is bound. Necessary on IVB for typed atomics because
             * they don't seem to respect null surfaces and will happily
             * corrupt or read random memory when no image is bound.
             */
            bld.CMP(bld.null_reg_ud(),
                    retype(size, BRW_REGISTER_TYPE_UD),
                    brw_imm_d(0), BRW_CONDITIONAL_NZ);

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent platforms implement compliant behavior when a null
             * surface is bound.
             */
            return BRW_PREDICATE_NONE;
         }
      }

      /**
       * Check whether the provided coordinates are within the image bounds
       * and write the comparison result to f0.0. Returns an appropriate
       * predication mode to use on subsequent image operations.
       */
      static brw_predicate
      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
                        const fs_reg &addr, unsigned dims)
      {
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         for (unsigned c = 0; c < dims; ++c)
            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
                          bld.CMP(bld.null_reg_ud(),
                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
                                  offset(size, bld, c),
                                  BRW_CONDITIONAL_L));

         return BRW_PREDICATE_NORMAL;
      }
   }

   namespace image_coordinates {
      /**
       * Return the total number of coordinates needed to address a texel of
       * the surface, which may be more than the sum of \p surf_dims and \p
       * arr_dims if padding is required.
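       *
       * For example, a 1D array image that has to be lowered to untyped
       * access needs three coordinates, because the array index must be
       * passed in the Z component with a zero inserted for Y.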
       */
      static unsigned
      num_image_coordinates(const fs_builder &bld,
                            unsigned surf_dims, unsigned arr_dims,
                            isl_format format)
      {
         /* HSW in vec4 mode and our software coordinate handling for untyped
          * reads want the array index to be at the Z component.
          */
         const bool array_index_at_z =
            format != ISL_FORMAT_UNSUPPORTED &&
            !isl_has_matching_typed_storage_image_format(
               bld.shader->devinfo, format);
         const unsigned zero_dims =
            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);

         return surf_dims + zero_dims + arr_dims;
      }

      /**
       * Transform image coordinates into the form expected by the
       * implementation.
       */
      static fs_reg
      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
                             unsigned surf_dims, unsigned arr_dims,
                             isl_format format)
      {
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (dims > surf_dims + arr_dims) {
            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
            /* The array index is required to be passed in as the Z component,
             * insert a zero at the Y component to shift it to the right
             * position.
             *
             * FINISHME: Factor out this frequently recurring pattern into a
             * helper function.
             */
            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
            const fs_reg dst = bld.vgrf(addr.type, dims);
            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
            return dst;
         } else {
            return addr;
         }
      }

      /**
       * Calculate the offset in memory of the texel given by \p coord.
       *
       * This is meant to be used with untyped surface messages to access a
       * tiled surface, which involves manually taking into account the
       * tiling and swizzling modes of the surface, so it will hopefully not
       * happen very often.
       *
       * The tiling algorithm implemented here matches either the X or Y
       * tiling layouts supported by the hardware depending on the tiling
       * coefficients passed to the program as uniforms. See Volume 1 Part 2
       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
       * explanation of the hardware tiling format.
       */
      static fs_reg
      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
                               const fs_reg &coord, unsigned dims)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);

         /* Shift the coordinates by the fixed surface offset. It may be
          * non-zero if the image is a single slice of a higher-dimensional
          * surface, or if a non-zero mipmap level of the surface is bound to
          * the pipeline. The offset needs to be applied here rather than at
          * surface state set-up time because the desired slice-level may
          * start mid-tile, so simply shifting the surface base address
          * wouldn't give a well-formed tiled surface in the general case.
          */
         for (unsigned c = 0; c < 2; ++c)
            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
                    (c < dims ?
                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
                     fs_reg(brw_imm_d(0))));

         /* The layout of 3-D textures in memory is sort-of like a tiling
          * format. At each miplevel, the slices are arranged in rows of
          * 2^level slices per row. The slice row is stored in tmp.y and
          * the slice within the row is stored in tmp.x.
          *
          * The layout of 2-D array textures and cubemaps is much simpler:
          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
          * stored in memory as an array of slices, each one being a 2-D
          * arrangement of miplevels, or as a 2D arrangement of miplevels,
          * each one being an array of slices. In either case the separation
          * between slices of the same LOD is equal to the qpitch value
          * provided as stride.w.
          *
          * This code can handle both 2D array and 3D textures by passing in
          * the miplevel as tile.z for 3-D textures and 0 in tile.z for 2-D
          * array textures.
          *
          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
          * of the hardware 3D texture and 2D array layouts.
          */
         if (dims > 2) {
            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
             * index.
             */
            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
            bld.SHR(offset(tmp, bld, 1),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
                    offset(tile, bld, 2));

            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
             * slice offset.
             */
            for (unsigned c = 0; c < 2; ++c) {
               bld.MUL(offset(tmp, bld, c),
                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
               bld.ADD(offset(addr, bld, c),
                       offset(addr, bld, c), offset(tmp, bld, c));
            }
         }

         if (dims > 1) {
            /* Calculate the major/minor x and y indices. In order to
             * accommodate both X and Y tiling, the Y-major tiling format is
             * treated as being a bunch of narrow X-tiles placed next to each
             * other. This means that the tile width for Y-tiling is actually
             * the width of one sub-column of the Y-major tile where each 4K
             * tile has 8 512B sub-columns.
             *
             * The major Y value is the row of tiles in which the pixel lives.
             * The major X value is the tile sub-column in which the pixel
             * lives; for X tiling, this is the same as the tile column, for Y
             * tiling, each tile has 8 sub-columns. The minor X and Y indices
             * are the position within the sub-column.
             */
            for (unsigned c = 0; c < 2; ++c) {
               /* Calculate the minor x and y indices. */
               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
                       brw_imm_d(0), offset(addr, bld, c));

               /* Calculate the major x and y indices. */
               bld.SHR(offset(major, bld, c),
                       offset(addr, bld, c), offset(tile, bld, c));
            }

            /* Calculate the texel index from the start of the tile row and
             * the vertical coordinate of the row.
             * Equivalent to:
             *   tmp.x = (major.x << tile.y << tile.x) +
             *           (minor.y << tile.x) + minor.x
             *   tmp.y = major.y << tile.y
             */
            bld.SHL(tmp, major, offset(tile, bld, 1));
            bld.ADD(tmp, tmp, offset(minor, bld, 1));
            bld.SHL(tmp, tmp, offset(tile, bld, 0));
            bld.ADD(tmp, tmp, minor);
            bld.SHL(offset(tmp, bld, 1),
                    offset(major, bld, 1), offset(tile, bld, 1));

            /* Add it to the start of the tile row. */
            bld.MUL(offset(tmp, bld, 1),
                    offset(tmp, bld, 1), offset(stride, bld, 1));
            bld.ADD(tmp, tmp, offset(tmp, bld, 1));

            /* Multiply by the Bpp value. */
            bld.MUL(dst, tmp, stride);

            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
               /* Take into account the two dynamically specified shifts.
                * Both are needed to implement swizzling of X-tiled
                * surfaces. For Y-tiled surfaces only one bit needs to be
                * XOR-ed with bit 6 of the memory address, so a swz value of
                * 0xff (actually interpreted as 31 by the hardware) will be
                * provided to cause the relevant bit of tmp.y to be zero and
                * turn the first XOR into the identity. For linear surfaces
                * or platforms lacking address swizzling both shifts will be
                * 0xff causing the relevant bits of both tmp.x and .y to be
                * zero, which effectively disables swizzling.
                */
               for (unsigned c = 0; c < 2; ++c)
                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));

               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
               bld.XOR(dst, dst, tmp);
            }

         } else {
            /* Multiply by the Bpp/stride value. Note that addr.y may be
             * non-zero even if the image is one-dimensional because a
             * vertical offset may have been applied above to select a
             * non-zero slice or level of a higher-dimensional texture.
             */
            bld.MUL(offset(addr, bld, 1),
                    offset(addr, bld, 1), offset(stride, bld, 1));
            bld.ADD(addr, addr, offset(addr, bld, 1));
            bld.MUL(dst, addr, stride);
         }

         return dst;
      }
   }

   namespace image_format_conversion {
      using image_format_info::color_u;

      namespace {
         /**
          * Maximum representable value in an unsigned integer with the given
          * number of bits.
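          *
          * e.g. scale(8) == 255 and scale(16) == 65535; these are also the
          * normalization constants used for the UNORM/SNORM conversions
          * below.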
          */
         inline unsigned
         scale(unsigned n)
         {
            return (1 << n) - 1;
         }
      }

      /**
       * Pack the vector \p src in a bitfield given the per-component bit
       * shifts and widths. Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
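       *
       * For example, with 8-bit widths and shifts of (0, 8, 16, 24) (RGBA8)
       * this amounts to dst.x = src.x | (src.y << 8) | (src.z << 16) |
       * (src.w << 24), with all four components landing in the first dword.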
       */
      static fs_reg
      emit_pack(const fs_builder &bld, const fs_reg &src,
                const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         bool seen[4] = {};

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);

               /* Shift each component left to the correct bitfield position. */
               bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));

               /* Add everything up. */
               if (seen[shifts[c] / 32]) {
                  bld.OR(offset(dst, bld, shifts[c] / 32),
                         offset(dst, bld, shifts[c] / 32), tmp);
               } else {
                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
                  seen[shifts[c] / 32] = true;
               }
            }
         }

         return dst;
      }

      /**
       * Unpack a vector from the bitfield \p src given the per-component bit
       * shifts and widths. Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
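       *
       * The SHL/ASR pair below also sign-extends signed components: e.g. an
       * 8-bit channel at shift 16 is moved up to bits 24..31 by the SHL and
       * then shifted back down by the ASR, which replicates the sign bit
       * when \p src has a signed type.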
       */
      static fs_reg
      emit_unpack(const fs_builder &bld, const fs_reg &src,
                  const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Shift left to discard the most significant bits. */
               bld.SHL(offset(dst, bld, c),
                       offset(src, bld, shifts[c] / 32),
                       brw_imm_ud(32 - shifts[c] % 32 - widths[c]));

               /* Shift back to the least significant bits using an arithmetic
                * shift to get sign extension on signed types.
                */
               bld.ASR(offset(dst, bld, c),
                       offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Convert an integer vector into another integer vector of the
       * specified bit widths, properly handling overflow.
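       *
       * For example, with an 8-bit signed width the source is clamped to
       * [-128, 127] and then masked with 0xff; with an 8-bit unsigned width
       * it is simply clamped to a maximum of 255.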
       */
      static fs_reg
      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         assert(src.type == dst.type);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp to the maximum value. */
               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
                               brw_imm_d((int)scale(widths[c] - s)),
                               BRW_CONDITIONAL_L);

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                  brw_imm_d(-(int)scale(widths[c] - s) - 1),
                                  BRW_CONDITIONAL_GE);

               /* Mask off all but the bits we actually want. Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }

      /**
       * Convert a normalized fixed-point vector of the specified signedness
       * and bit widths into a floating point vector.
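       *
       * e.g. an 8-bit UNORM value of 255 becomes 255 * (1.0 / 255) = 1.0.
       * For SNORM formats the result is additionally clamped to -1.0 from
       * below, since the most negative two's complement value would
       * otherwise map slightly below -1.0.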
       */
      static fs_reg
      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
                               const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Convert to float. */
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Divide by the normalization constants. */
               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
                       brw_imm_f(1.0f / scale(widths[c] - s)));

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c),
                                  offset(dst, bld, c), brw_imm_f(-1.0f),
                                  BRW_CONDITIONAL_GE);
            }
         }
         return dst;
      }

      /**
       * Convert a floating-point vector into a normalized fixed-point vector
       * of the specified signedness and bit widths.
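       *
       * e.g. for an 8-bit SNORM channel the input is clamped to [-1.0, 1.0],
       * scaled by 127.0, rounded to the nearest integer and masked with 0xff.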
       */
      static fs_reg
      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
                             const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp the normalized floating-point argument. */
               if (is_signed) {
                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
                                  brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);

                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(1.0f), BRW_CONDITIONAL_L);
               } else {
                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
                                             offset(src, bld, c)));
               }

               /* Multiply by the normalization constants. */
               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
                       brw_imm_f((float)scale(widths[c] - s)));

               /* Convert to integer. */
               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));

               /* Mask off all but the bits we actually want. Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }

      /**
       * Convert a floating point vector of the specified bit widths into a
       * 32-bit floating point vector.
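       *
       * The small-float channels of formats like R11G11B10_FLOAT have a
       * 5-bit exponent and no sign bit, just like half-float, so shifting
       * them left by 15 - width bits lines them up with the half-float
       * layout and lets F16TO32 do the actual conversion.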
       */
      static fs_reg
      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
                * This works because they have a 5-bit exponent just like the
                * 16-bit floating point format, and they have no sign bit.
                */
               if (widths[c] < 16)
                  bld.SHL(offset(dst, bld, c),
                          offset(dst, bld, c), brw_imm_ud(15 - widths[c]));

               /* Convert to 32-bit floating point. */
               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
            }
         }

         return fdst;
      }

      /**
       * Convert a vector into a floating point vector of the specified bit
       * widths.
       */
      static fs_reg
      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
                            const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));

               /* Clamp to the minimum value. */
               if (widths[c] < 16)
                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(0.0f), BRW_CONDITIONAL_GE);

               /* Convert to 16-bit floating-point. */
               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));

               /* Discard the least significant bits to get floating point
                * numbers of the requested width. This works because the
                * 10-bit and 11-bit floating point formats have a 5-bit
                * exponent just like the 16-bit format, and they have no sign
                * bit.
                */
               if (widths[c] < 16)
                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_ud(15 - widths[c]));
            }
         }

         return dst;
      }

      /**
       * Fill missing components of a vector with 0, 0, 0, 1.
       */
      static fs_reg
      emit_pad(const fs_builder &bld, const fs_reg &src,
               const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);
         const unsigned pad[] = { 0, 0, 0, 1 };

         for (unsigned c = 0; c < 4; ++c)
            bld.MOV(offset(dst, bld, c),
                    widths[c] ? offset(src, bld, c)
                              : fs_reg(brw_imm_ud(pad[c])));

         return dst;
      }
   }
}

namespace brw {
   namespace image_access {
      /**
       * Load a vector from a surface of the given format and dimensionality
       * at the given coordinates. \p surf_dims and \p arr_dims give the
       * number of non-array and array coordinates of the image respectively.
       */
      fs_reg
      emit_image_load(const fs_builder &bld,
                      const fs_reg &image, const fs_reg &addr,
                      unsigned surf_dims, unsigned arr_dims,
                      unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const gen_device_info *devinfo = bld.shader->devinfo;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);
         fs_reg tmp;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
            /* Hopefully we get here most of the time... */
            tmp = emit_typed_read(bld, image, saddr, dims,
                                  isl_format_get_num_channels(lower_format));
         } else {
            /* Untyped surface reads return 32 bits of the surface per
             * component, without any sort of unpacking or type conversion,
             */
            const unsigned size = isl_format_get_layout(format)->bpb / 32;
            /* they don't properly handle out of bounds access, so we have to
             * check manually if the coordinates are valid and predicate the
             * surface read on the result,
             */
            const brw_predicate pred =
               emit_untyped_image_check(bld, image,
                                        emit_bounds_check(bld, image,
                                                          saddr, dims));

            /* and they don't know about surface coordinates, so we need to
             * convert them to a raw memory offset.
             */
            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);

            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);

            /* An out of bounds surface access should give zero as result. */
            for (unsigned c = 0; c < size; ++c)
               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
                                           offset(tmp, bld, c), brw_imm_d(0)));
         }

         /* Set the register type to D instead of UD if the data type is
          * represented as a signed integer in memory so that sign extension
          * is handled correctly by unpack.
          */
         if (needs_sign_extension(format))
            tmp = retype(tmp, BRW_REGISTER_TYPE_D);

         if (!has_supported_bit_layout(devinfo, format)) {
            /* Unpack individual vector components from the bitfield if the
             * hardware is unable to do it for us.
             */
            if (has_split_bit_layout(devinfo, format))
               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
                               get_bit_widths(lower_format));
            else
               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
                                 get_bit_widths(format));

         } else if ((needs_sign_extension(format) &&
                     !is_conversion_trivial(devinfo, format)) ||
                    has_undefined_high_bits(devinfo, format)) {
            /* Perform a trivial unpack even though the bit layout matches in
             * order to get the most significant bits of each component
             * initialized properly.
             */
            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
                              get_bit_widths(format));
         }

         if (!isl_format_has_int_channel(format)) {
            if (is_conversion_trivial(devinfo, format)) {
               /* Just need to cast the vector to the target type. */
               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
            } else {
               /* Do the right sort of type conversion to float. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_from_float(
                     bld, tmp, get_bit_widths(format));
               else
                  tmp = emit_convert_from_scaled(
                     bld, tmp, get_bit_widths(format),
                     isl_format_has_snorm_channel(format));
            }
         }

         /* Initialize missing components of the result. */
         return emit_pad(bld, tmp, get_bit_widths(format));
      }

      /**
       * Store a vector in a surface of the given format and dimensionality at
       * the given coordinates. \p surf_dims and \p arr_dims give the number
       * of non-array and array coordinates of the image respectively.
       */
      void
      emit_image_store(const fs_builder &bld, const fs_reg &image,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned surf_dims, unsigned arr_dims,
                       unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const gen_device_info *devinfo = bld.shader->devinfo;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (gl_format == GL_NONE) {
            /* We don't know what the format is, but that's fine because it
             * implies write-only access, and typed surface writes are always
             * able to take care of type conversion and packing for us.
             */
            emit_typed_write(bld, image, saddr, src, dims, 4);

         } else {
            const isl_format lower_format =
               isl_lower_storage_image_format(devinfo, format);
            fs_reg tmp = src;

            if (!is_conversion_trivial(devinfo, format)) {
               /* Do the right sort of type conversion. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));

               else if (isl_format_has_int_channel(format))
                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
                                                isl_format_has_sint_channel(format));

               else
                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
                                               isl_format_has_snorm_channel(format));
            }

            /* We're down to bit manipulation at this point. */
            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);

            if (!has_supported_bit_layout(devinfo, format)) {
               /* Pack the vector components into a bitfield if the hardware
                * is unable to do it for us.
                */
               if (has_split_bit_layout(devinfo, format))
                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
                                    get_bit_widths(lower_format));

               else
                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
                                  get_bit_widths(format));
            }

            if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
               /* Hopefully we get here most of the time... */
               emit_typed_write(bld, image, saddr, tmp, dims,
                                isl_format_get_num_channels(lower_format));

            } else {
               /* Untyped surface writes store 32 bits of the surface per
                * component, without any sort of packing or type conversion,
                */
               const unsigned size = isl_format_get_layout(format)->bpb / 32;

               /* they don't properly handle out of bounds access, so we have
                * to check manually if the coordinates are valid and predicate
                * the surface write on the result,
                */
               const brw_predicate pred =
                  emit_untyped_image_check(bld, image,
                                           emit_bounds_check(bld, image,
                                                             saddr, dims));

               /* and, phew, they don't know about surface coordinates, so we
                * need to convert them to a raw memory offset.
                */
               const fs_reg laddr = emit_address_calculation(
                  bld, image, saddr, dims);

               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
            }
         }
      }

      /**
       * Perform an atomic read-modify-write operation in a surface of the
       * given dimensionality at the given coordinates. \p surf_dims and \p
       * arr_dims give the number of non-array and array coordinates of the
       * image respectively. Main building block of the imageAtomic GLSL
       * built-ins.
       */
      fs_reg
      emit_image_atomic(const fs_builder &bld,
                        const fs_reg &image, const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned surf_dims, unsigned arr_dims,
                        unsigned rsize, unsigned op)
      {
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         /* Avoid performing an atomic operation on an unbound surface. */
         const brw_predicate pred = emit_typed_atomic_check(bld, image);

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
                                   ISL_FORMAT_R32_UINT);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims,
                                  ISL_FORMAT_R32_UINT);

         /* Thankfully we can do without untyped atomics here. */
         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
                                              dims, rsize, op, pred);

         /* An unbound surface access should give zero as result. */
         if (rsize && pred)
            set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));

         return retype(tmp, src0.type);
      }
   }
}