2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, is_conversion
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - is_conversion is true if this opcode represents a type conversion
43 - algebraic_properties is a space-separated string, where nir_op_is_ is
44 prepended before each entry
45 - const_expr is an expression or series of statements that computes the
46 constant value of the opcode given the constant values of its inputs.
48 Constant expressions are formed from the variables src0, src1, ...,
49 src(N-1), where N is the number of arguments. The output of the
50 expression should be stored in the dst variable. Per-component input
51 and output variables will be scalars and non-per-component input and
52 output variables will be a struct with fields named x, y, z, and w
53 all of the correct type. Input and output variables can be assumed
54 to already be of the correct type and need no conversion. In
55 particular, the conversion from the C bool type to/from NIR_TRUE and
56 NIR_FALSE happens automatically.
58 For per-component instructions, the entire expression will be
59 executed once for each component. For non-per-component
60 instructions, the expression is expected to store the correct values
61 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
62 constant expression, an assignment to dst will happen automatically
63 and the result will be equivalent to "dst = <expression>" for
64 per-component instructions and "dst.x = dst.y = ... = <expression>"
65 for non-per-component instructions.
67 assert isinstance(name
, str)
68 assert isinstance(output_size
, int)
69 assert isinstance(output_type
, str)
70 assert isinstance(input_sizes
, list)
71 assert isinstance(input_sizes
[0], int)
72 assert isinstance(input_types
, list)
73 assert isinstance(input_types
[0], str)
74 assert isinstance(is_conversion
, bool)
75 assert isinstance(algebraic_properties
, str)
76 assert isinstance(const_expr
, str)
77 assert len(input_sizes
) == len(input_types
)
78 assert 0 <= output_size
<= 4
79 for size
in input_sizes
:
84 self
.num_inputs
= len(input_sizes
)
85 self
.output_size
= output_size
86 self
.output_type
= output_type
87 self
.input_sizes
= input_sizes
88 self
.input_types
= input_types
89 self
.is_conversion
= is_conversion
90 self
.algebraic_properties
= algebraic_properties
91 self
.const_expr
= const_expr
93 # helper variables for strings
# Splits a NIR type string into a base type ("int", "uint", "float" or
# "bool") and an optional run of trailing digits giving the bit size.
_TYPE_SPLIT_RE = re.compile(
    r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?'
)
def type_has_size(type_):
    """Tell whether a NIR type string carries an explicit bit size.

    Returns True for strings such as "uint32" and False for unsized ones
    such as "float".  Fails an assertion if the string does not start with
    a recognised base type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
def type_size(type_):
    """Return the bit size of a sized NIR type string as an int.

    Fails an assertion if the string does not start with a recognised base
    type, or if it has no trailing bit size (e.g. plain "float").
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
123 def type_sizes(type_
):
124 if type_has_size(type_
):
125 return [type_size(type_
)]
126 elif type_
== 'bool':
128 elif type_
== 'float':
131 return [1, 8, 16, 32, 64]
def type_base_type(type_):
    """Return the base type of a NIR type string with any bit size removed.

    For example "uint32" yields "uint".  Fails an assertion if the string
    does not start with a recognised base type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
138 # Operation where the first two sources are commutative.
140 # For 2-source operations, this is just mathematical commutativity. Some
141 # 3-source operations, like ffma, are only commutative in the first two
# Algebraic property flags.  Each flag ends with a space so that several
# flags can be combined by plain string concatenation.
_2src_commutative = (
    "2src_commutative "
)
associative = (
    "associative "
)
146 # global dictionary of opcodes
149 def opcode(name
, output_size
, output_type
, input_sizes
, input_types
,
150 is_conversion
, algebraic_properties
, const_expr
):
151 assert name
not in opcodes
152 opcodes
[name
] = Opcode(name
, output_size
, output_type
, input_sizes
,
153 input_types
, is_conversion
, algebraic_properties
,
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output and input types may differ.

    The opcode is registered with output size 0 and a single source of
    size 0 (NOTE(review): presumably the per-component marker -- confirm
    against Opcode), is not flagged as a conversion, and has no algebraic
    properties.
    """
    source_sizes = [0]
    source_types = [in_type]
    opcode(name, 0, out_type, source_sizes, source_types,
           False, "", const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type ty.

    The opcode is registered with output size 0 and a single source of
    size 0, is not flagged as a conversion, and has no algebraic properties.
    """
    source_sizes = [0]
    source_types = [ty]
    opcode(name, 0, ty, source_sizes, source_types, False, "", const_expr)
162 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
164 opcode(name
, output_size
, output_type
, [input_size
], [input_type
],
165 False, "", const_expr
)
167 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
168 reduce_expr
, final_expr
):
170 return "(" + prereduce_expr
.format(src
=src
) + ")"
172 return final_expr
.format(src
="(" + src
+ ")")
173 def reduce_(src0
, src1
):
174 return reduce_expr
.format(src0
=src0
, src1
=src1
)
175 src0
= prereduce("src0.x")
176 src1
= prereduce("src0.y")
177 src2
= prereduce("src0.z")
178 src3
= prereduce("src0.w")
179 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
180 final(reduce_(src0
, src1
)))
181 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
182 final(reduce_(reduce_(src0
, src1
), src2
)))
183 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
184 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source numeric conversion opcode.

    Takes the same arguments as unop_convert, but the opcode is registered
    with is_conversion set to True.
    """
    source_sizes = [0]
    source_types = [in_type]
    opcode(name, 0, out_type, source_sizes, source_types,
           True, "", const_expr)
# Basic unary opcodes.  Each const_expr is a C expression over src0; float
# opcodes that test bit_size pick the double or the float form of the
# computation.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
# invert every bit of the integer
unop("inot", tint, "~src0")
unop(
    "fsign", tfloat,
    "bit_size == 64 ? "
    "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
    "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))",
)
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop(
    "fsat", tfloat,
    "bit_size == 64 ? "
    "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : "
    "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))",
)
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat,
     "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
209 # Generate all of the numeric conversion opcodes
210 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
212 dst_types
= [tfloat
, tint
]
214 dst_types
= [tfloat
, tint
, tbool
]
216 dst_types
= [tfloat
, tuint
]
217 elif src_t
== tfloat
:
218 dst_types
= [tint
, tuint
, tfloat
, tbool
]
220 for dst_t
in dst_types
:
221 for dst_bit_size
in type_sizes(dst_t
):
222 if dst_bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
223 rnd_modes
= ['_rtne', '_rtz', '']
224 for rnd_mode
in rnd_modes
:
225 if rnd_mode
== '_rtne':
228 dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
233 elif rnd_mode
== '_rtz':
236 dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
244 unop_numeric_convert("{0}2{1}{2}{3}".format(src_t
[0],
248 dst_t
+ str(dst_bit_size
),
250 elif dst_bit_size
== 32 and dst_t
== tfloat
and src_t
== tfloat
:
252 if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
253 dst = _mesa_double_to_float_rtz(src0);
258 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
260 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
262 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
263 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
265 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
268 # Unary floating-point rounding operations.
271 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
272 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
273 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
274 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
275 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
277 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
279 # Trigonometric operations.
282 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
283 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
286 unop_convert("frexp_exp", tint32
, tfloat
, "frexp(src0, &dst);")
287 unop_convert("frexp_sig", tfloat
, tfloat
, "int n; dst = frexp(src0, &n);")
289 # Partial derivatives.
# All derivative opcodes constant-fold to zero: the derivative of a
# constant is 0.  Registration order is preserved.
for _derivative_op in ("fddx", "fddy",
                       "fddx_fine", "fddy_fine",
                       "fddx_coarse", "fddy_coarse"):
    unop(_derivative_op, tfloat, "0.0")
# Drop the loop variable so that no extra module-level name is left behind.
del _derivative_op
300 # Floating point pack and unpack operations.
303 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
304 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
305 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
306 """.replace("fmt", fmt
))
309 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
310 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
311 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
312 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
313 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
314 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the "unpack_<fmt>_2x16" opcode.

    The opcode takes one uint32 source and produces a two-component
    float32 vector.  dst.x is decoded from the low 16 bits of the source
    and dst.y from the high 16 bits, each through the C helper
    unpack_<fmt>_1x16.  Every occurrence of "fmt" in the C template is
    replaced by the fmt argument.
    """
    # The high half must be shifted *down* before the uint16_t cast; a left
    # shift followed by the truncating cast would always yield zero.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
323 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
324 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
325 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
326 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
327 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
328 """.replace("fmt", fmt
))
342 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
343 dst.x = (src0.x & 0xffff) | (src0.y << 16);
346 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
347 dst.x = (src0.x << 0) |
353 unop_horiz("pack_32_2x16", 1, tuint32
, 2, tuint16
,
354 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
356 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
357 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
359 unop_horiz("pack_64_4x16", 1, tuint64
, 4, tuint16
,
360 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
362 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
363 "dst.x = src0.x; dst.y = src0.x >> 32;")
# Split one 64-bit value into four 16-bit components, lowest bits first.
# The source is a single-component vector, so every output component must
# be extracted from src0.x (src0.w does not exist for this opcode).
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
368 unop_horiz("unpack_32_2x16", 2, tuint16
, 1, tuint32
,
369 "dst.x = src0.x; dst.y = src0.x >> 16;")
371 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32
, 1, tuint32
, """
372 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
373 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
376 # Lowered floating point unpacking operations.
378 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
379 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
380 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
381 "unpack_half_1x16((uint16_t)(src0 >> 16))")
383 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32
, tuint32
,
384 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
385 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32
, tuint32
,
386 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
388 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
389 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
391 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
392 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
394 # Bit operations, part of ARB_gpu_shader5.
397 unop("bitfield_reverse", tuint32
, """
398 /* we're not winning any awards for speed here, but that's ok */
400 for (unsigned bit = 0; bit < 32; bit++)
401 dst |= ((src0 >> bit) & 1) << (31 - bit);
403 unop_convert("bit_count", tuint32
, tuint
, """
405 for (unsigned bit = 0; bit < bit_size; bit++) {
406 if ((src0 >> bit) & 1)
411 unop_convert("ufind_msb", tint32
, tuint
, """
413 for (int bit = bit_size - 1; bit >= 0; bit--) {
414 if ((src0 >> bit) & 1) {
421 unop("ifind_msb", tint32
, """
423 for (int bit = 31; bit >= 0; bit--) {
424 /* If src0 < 0, we're looking for the first 0 bit.
425 * if src0 >= 0, we're looking for the first 1 bit.
427 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
428 (!((src0 >> bit) & 1) && (src0 < 0))) {
435 unop_convert("find_lsb", tint32
, tint
, """
437 for (unsigned bit = 0; bit < bit_size; bit++) {
438 if ((src0 >> bit) & 1) {
# Register fnoise<i>_<j> for every output size i and input size j in 1..4.
# All of them constant-fold to 0.0f.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise%d_%d" % (i, j), i, tfloat, j, tfloat, "0.0f")
451 # AMD_gcn_shader extended instructions
452 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
454 float absX = fabs(src0.x);
455 float absY = fabs(src0.y);
456 float absZ = fabs(src0.z);
459 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
460 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
461 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
463 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
464 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
465 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
466 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
467 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
468 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
470 dst.x = dst.x / ma + 0.5;
471 dst.y = dst.y / ma + 0.5;
474 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
475 float absX = fabs(src0.x);
476 float absY = fabs(src0.y);
477 float absZ = fabs(src0.z);
478 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
479 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
480 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
481 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
482 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
483 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
486 # Sum of vector components
487 unop_reduce("fsum", 1, tfloat
, tfloat
, "{src}", "{src0} + {src1}", "{src}")
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    The result type is out_type.  Output and both sources are registered
    with size 0, the opcode is not flagged as a conversion, and alg_props
    is passed through as its algebraic properties.
    """
    source_sizes = [0] * 2
    source_types = [in_type] * 2
    opcode(name, 0, out_type, source_sizes, source_types,
           False, alg_props, const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all use ty."""
    binop_convert(name, out_type=ty, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a tbool1."""
    binop_convert(name, out_type=tbool1, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
def binop_compare16(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a tbool16."""
    binop_convert(name, out_type=tbool16, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a tbool32."""
    binop_convert(name, out_type=tbool32, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register a comparison opcode once per boolean result size.

    Three opcodes are created, in this order: name (tbool1 result),
    name + "16" (tbool16 result) and name + "32" (tbool32 result).
    """
    variants = (
        ("", binop_compare),
        ("16", binop_compare16),
        ("32", binop_compare32),
    )
    for suffix, register in variants:
        register(name + suffix, ty, alg_props, const_expr)
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes for both sources.

    The opcode is not flagged as a conversion and has no algebraic
    properties.
    """
    source_sizes = [src1_size, src2_size]
    source_types = [src1_type, src2_type]
    opcode(name, out_size, out_type, source_sizes, source_types,
           False, "", const_expr)
515 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
516 reduce_expr
, final_expr
):
518 return final_expr
.format(src
= "(" + src
+ ")")
519 def reduce_(src0
, src1
):
520 return reduce_expr
.format(src0
=src0
, src1
=src1
)
521 def prereduce(src0
, src1
):
522 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
523 src0
= prereduce("src0.x", "src1.x")
524 src1
= prereduce("src0.y", "src1.y")
525 src2
= prereduce("src0.z", "src1.z")
526 src3
= prereduce("src0.w", "src1.w")
527 opcode(name
+ "2", output_size
, output_type
,
528 [2, 2], [src_type
, src_type
], False, _2src_commutative
,
529 final(reduce_(src0
, src1
)))
530 opcode(name
+ "3", output_size
, output_type
,
531 [3, 3], [src_type
, src_type
], False, _2src_commutative
,
532 final(reduce_(reduce_(src0
, src1
), src2
)))
533 opcode(name
+ "4", output_size
, output_type
,
534 [4, 4], [src_type
, src_type
], False, _2src_commutative
,
535 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Register a reducing opcode family once per boolean result size.

    binop_reduce is called three times, in this order: with name and a
    tbool1 result, with "b16" + name[1:] and a tbool16 result, and with
    "b32" + name[1:] and a tbool32 result.
    """
    def register(variant_name, bool_type):
        # Everything except the name and result type is shared.
        binop_reduce(variant_name, output_size, bool_type, src_type,
                     prereduce_expr, reduce_expr, final_expr)

    register(name, tbool1)
    register("b16" + name[1:], tbool16)
    register("b32" + name[1:], tbool32)
546 binop("fadd", tfloat
, _2src_commutative
+ associative
,"""
547 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
549 dst = _mesa_double_add_rtz(src0, src1);
551 dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
556 binop("iadd", tint
, _2src_commutative
+ associative
, "src0 + src1")
557 binop("iadd_sat", tint
, _2src_commutative
, """
559 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
560 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
562 binop("uadd_sat", tuint
, _2src_commutative
,
563 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
564 binop("isub_sat", tint
, "", """
566 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
567 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
569 binop("usub_sat", tuint
, "", "src0 < src1 ? 0 : src0 - src1")
571 binop("fsub", tfloat
, "", """
572 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
574 dst = _mesa_double_sub_rtz(src0, src1);
576 dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
581 binop("isub", tint
, "", "src0 - src1")
583 binop("fmul", tfloat
, _2src_commutative
+ associative
, """
584 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
586 dst = _mesa_double_mul_rtz(src0, src1);
588 dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
593 # low 32-bits of signed/unsigned integer multiply
594 binop("imul", tint
, _2src_commutative
+ associative
, "src0 * src1")
596 # Generate a 64-bit result from two 32-bit quantities
597 binop_convert("imul_2x32_64", tint64
, tint32
, _2src_commutative
,
598 "(int64_t)src0 * (int64_t)src1")
599 binop_convert("umul_2x32_64", tuint64
, tuint32
, _2src_commutative
,
600 "(uint64_t)src0 * (uint64_t)src1")
602 # high 32-bits of signed integer multiply
603 binop("imul_high", tint
, _2src_commutative
, """
604 if (bit_size == 64) {
605 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
606 * extension to work properly. The casts are kind-of annoying but needed
607 * to prevent compiler warnings.
609 uint32_t src0_u32[4] = {
615 uint32_t src1_u32[4] = {
621 uint32_t prod_u32[4];
622 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
623 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
625 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
629 # high 32-bits of unsigned integer multiply
630 binop("umul_high", tuint
, _2src_commutative
, """
631 if (bit_size == 64) {
632 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
633 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
634 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
635 uint32_t prod_u32[4];
636 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
637 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
639 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
643 # low 32-bits of unsigned integer multiply
644 binop("umul_low", tuint32
, _2src_commutative
, """
645 uint64_t mask = (1 << (bit_size / 2)) - 1;
646 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
650 binop("fdiv", tfloat
, "", "src0 / src1")
651 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
652 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
654 # returns a boolean representing the carry resulting from the addition of
655 # the two unsigned arguments.
657 binop_convert("uadd_carry", tuint
, tuint
, _2src_commutative
, "src0 + src1 < src0")
659 # returns a boolean representing the borrow resulting from the subtraction
660 # of the two unsigned arguments.
662 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
664 # hadd: (a + b) >> 1 (without overflow)
665 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
666 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
667 # = 2 * (x & y) + (x & ~y) + (~x & y)
668 # = ((x & y) << 1) + (x ^ y)
670 # Since we know that the bottom bit of (x & y) << 1 is zero,
672 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
673 # = (x & y) + ((x ^ y) >> 1)
674 binop("ihadd", tint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
675 binop("uhadd", tuint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
677 # rhadd: (a + b + 1) >> 1 (without overflow)
678 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
679 # = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
680 # = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
681 # = ((x | y) << 1) - (x ^ y) + 1
683 # Since we know that the bottom bit of (x | y) << 1 is zero,
685 # (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
686 # = (x | y) - ((x ^ y) >> 1)
# Rounding halving add: (a + b + 1) >> 1 without intermediate overflow,
# computed as (a | b) - ((a ^ b) >> 1).  The shifted XOR term has to be
# subtracted; adding it gives wrong results (a = 2, b = 0 must yield 1,
# but the additive form yields 3).
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
690 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
692 # For signed integers, there are several different possible definitions of
693 # "modulus" or "remainder". We follow the conventions used by LLVM and
694 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
695 # operation while the imod opcode implements the more mathematical
696 # "modulus" operation. For details on the difference, see
698 # http://mathforum.org/library/drmath/view/52343.html
700 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
701 binop("imod", tint
, "",
702 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
703 " src0 % src1 : src0 % src1 + src1)")
704 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
705 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
712 # these integer-aware comparisons return a boolean (0 or ~0)
714 binop_compare_all_sizes("flt", tfloat
, "", "src0 < src1")
715 binop_compare_all_sizes("fge", tfloat
, "", "src0 >= src1")
716 binop_compare_all_sizes("feq", tfloat
, _2src_commutative
, "src0 == src1")
717 binop_compare_all_sizes("fne", tfloat
, _2src_commutative
, "src0 != src1")
718 binop_compare_all_sizes("ilt", tint
, "", "src0 < src1")
719 binop_compare_all_sizes("ige", tint
, "", "src0 >= src1")
720 binop_compare_all_sizes("ieq", tint
, _2src_commutative
, "src0 == src1")
721 binop_compare_all_sizes("ine", tint
, _2src_commutative
, "src0 != src1")
722 binop_compare_all_sizes("ult", tuint
, "", "src0 < src1")
723 binop_compare_all_sizes("uge", tuint
, "", "src0 >= src1")
725 # integer-aware GLSL-style comparisons that compare floats and ints
727 binop_reduce_all_sizes("ball_fequal", 1, tfloat
, "{src0} == {src1}",
728 "{src0} && {src1}", "{src}")
729 binop_reduce_all_sizes("bany_fnequal", 1, tfloat
, "{src0} != {src1}",
730 "{src0} || {src1}", "{src}")
731 binop_reduce_all_sizes("ball_iequal", 1, tint
, "{src0} == {src1}",
732 "{src0} && {src1}", "{src}")
733 binop_reduce_all_sizes("bany_inequal", 1, tint
, "{src0} != {src1}",
734 "{src0} || {src1}", "{src}")
736 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
738 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
739 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
740 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
741 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
743 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
744 # and false respectively
746 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
747 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
748 binop("seq", tfloat32
, _2src_commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
749 binop("sne", tfloat32
, _2src_commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
751 # SPIRV shifts are undefined for shift-operands >= bitsize,
752 # but SM5 shifts are defined to use the least significant bits, only
753 # The NIR definition is according to the SM5 specification.
754 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
755 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
756 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
757 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
758 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "",
759 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
761 opcode("urol", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
762 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
763 dst = (src0 << (src1 & rotate_mask)) |
764 (src0 >> (-src1 & rotate_mask));
766 opcode("uror", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
767 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
768 dst = (src0 >> (src1 & rotate_mask)) |
769 (src0 << (-src1 & rotate_mask));
772 # bitwise logic operators
774 # These are also used as boolean and, or, xor for hardware supporting
778 binop("iand", tuint
, _2src_commutative
+ associative
, "src0 & src1")
779 binop("ior", tuint
, _2src_commutative
+ associative
, "src0 | src1")
780 binop("ixor", tuint
, _2src_commutative
+ associative
, "src0 ^ src1")
783 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
786 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
787 "{src0} * {src1}", "{src0} + {src1}", "{src}")
789 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
790 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
791 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
792 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
794 binop("fmin", tfloat
, "", "fmin(src0, src1)")
795 binop("imin", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
796 binop("umin", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
797 binop("fmax", tfloat
, "", "fmax(src0, src1)")
798 binop("imax", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
799 binop("umax", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
801 # Saturated vector add for 4 8bit ints.
802 binop("usadd_4x8", tint32
, _2src_commutative
+ associative
, """
804 for (int i = 0; i < 32; i += 8) {
805 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
809 # Saturated vector subtract for 4 8bit ints.
810 binop("ussub_4x8", tint32
, "", """
812 for (int i = 0; i < 32; i += 8) {
813 int src0_chan = (src0 >> i) & 0xff;
814 int src1_chan = (src1 >> i) & 0xff;
815 if (src0_chan > src1_chan)
816 dst |= (src0_chan - src1_chan) << i;
820 # vector min for 4 8bit ints.
821 binop("umin_4x8", tint32
, _2src_commutative
+ associative
, """
823 for (int i = 0; i < 32; i += 8) {
824 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
828 # vector max for 4 8bit ints.
829 binop("umax_4x8", tint32
, _2src_commutative
+ associative
, """
831 for (int i = 0; i < 32; i += 8) {
832 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
836 # unorm multiply: (a * b) / 255.
837 binop("umul_unorm_4x8", tint32
, _2src_commutative
+ associative
, """
839 for (int i = 0; i < 32; i += 8) {
840 int src0_chan = (src0 >> i) & 0xff;
841 int src1_chan = (src1 >> i) & 0xff;
842 dst |= ((src0_chan * src1_chan) / 255) << i;
# Use the double-precision pow() only for 64-bit operands and powf() for
# narrower ones, in line with the other float opcodes in this file that
# choose a C function based on bit_size (e.g. fsqrt).
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
848 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
849 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
851 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
852 "src0 | ((uint64_t)src1 << 32)")
854 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
855 "src0 | ((uint32_t)src1 << 16)")
857 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
858 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
859 # are from the low five bits of src0 and src1, respectively.
860 binop_convert("bfm", tuint32
, tint32
, "", """
861 int bits = src0 & 0x1F;
862 int offset = src1 & 0x1F;
863 dst = ((1u << bits) - 1) << offset;
866 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], False, "", """
867 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
868 /* flush denormals to zero. */
870 dst = copysignf(0.0f, src0);
873 # Combines the first component of each input to make a 2-component vector.
875 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
881 binop("extract_u8", tuint
, "", "(uint8_t)(src0 >> (src1 * 8))")
882 binop("extract_i8", tint
, "", "(int8_t)(src0 >> (src1 * 8))")
885 binop("extract_u16", tuint
, "", "(uint16_t)(src0 >> (src1 * 16))")
886 binop("extract_i16", tint
, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose sources and result all use ty.

    Output and all three sources are registered with size 0, the opcode is
    not flagged as a conversion, and alg_props is passed through as its
    algebraic properties.
    """
    source_sizes = [0] * 3
    source_types = [ty] * 3
    opcode(name, 0, ty, source_sizes, source_types,
           False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source tuint opcode with explicit sizes.

    The result and all three sources are typed tuint.  The opcode is not
    flagged as a conversion and has no algebraic properties.
    """
    source_sizes = [src1_size, src2_size, src3_size]
    source_types = [tuint] * 3
    opcode(name, output_size, tuint, source_sizes, source_types,
           False, "", const_expr)
896 triop("ffma", tfloat
, _2src_commutative
, """
897 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
899 dst = _mesa_double_fma_rtz(src0, src1, src2);
900 else if (bit_size == 32)
901 dst = _mesa_float_fma_rtz(src0, src1, src2);
903 dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
906 dst = fmaf(src0, src1, src2);
908 dst = fma(src0, src1, src2);
# Linear interpolation: src2 is the blend weight, giving src0 at weight 0
# and src1 at weight 1.
triop(
    "flrp",
    tfloat,
    "",
    "src0 * (1 - src2) + src1 * src2",
)
916 # A vector conditional select instruction (like ?:, but operating per-
917 # component on vectors). There are two versions, one for floating point
918 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
# Float-condition select: any src0 that compares unequal to 0.0f picks src1,
# otherwise src2 is returned.  Declared for 32-bit floats only.
triop(
    "fcsel",
    tfloat32,
    "",
    "(src0 != 0.0f) ? src1 : src2",
)
# Three-operand minimum, maximum and median, in float, signed and unsigned
# flavours.  The median is computed as max(min(max(a, b), c), min(a, b)).
#
# The float variants are declared with the unsized tfloat type, so their
# constant expressions are also evaluated with 64-bit operands.  They
# therefore use the double-precision fmin()/fmax() rather than
# fminf()/fmaxf(): the latter would narrow double operands to float before
# comparing and return a rounded result.  For narrower floats the promotion
# to double is exact, so the result is unchanged.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

triop("fmed3", tfloat, "",
      "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "",
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "",
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
# Boolean-condition selects: src0 is the condition and chooses src1 when
# non-zero, src2 otherwise.  One opcode is declared per condition type
# (tbool1, tbool16, tbool32); the data operands and result are tuint.
for _csel_name, _csel_cond_type in (
    ("bcsel", tbool1),
    ("b16csel", tbool16),
    ("b32csel", tbool32),
):
    opcode(_csel_name, 0, tuint, [0, 0, 0],
           [_csel_cond_type, tuint, tuint], False, "", "src0 ? src1 : src2")
# Drop the loop temporaries so the module namespace is left as it was.
del _csel_name, _csel_cond_type
944 triop("bfi", tuint32
, "", """
945 unsigned mask = src0, insert = src1, base = src2;
954 dst = (base & ~mask) | (insert & mask);
# Bitwise select: src0 is a mask; result bits come from src1 where the mask
# bit is set and from src2 where it is clear.
triop(
    "bitfield_select",
    tuint,
    "",
    "(src0 & src1) | (~src0 & src2)",
)
961 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
962 opcode("ubfe", 0, tuint32
,
963 [0, 0, 0], [tuint32
, tuint32
, tuint32
], False, "", """
964 unsigned base = src0;
965 unsigned offset = src1 & 0x1F;
966 unsigned bits = src2 & 0x1F;
969 } else if (offset + bits < 32) {
970 dst = (base << (32 - bits - offset)) >> (32 - bits);
972 dst = base >> offset;
975 opcode("ibfe", 0, tint32
,
976 [0, 0, 0], [tint32
, tuint32
, tuint32
], False, "", """
978 unsigned offset = src1 & 0x1F;
979 unsigned bits = src2 & 0x1F;
982 } else if (offset + bits < 32) {
983 dst = (base << (32 - bits - offset)) >> (32 - bits);
985 dst = base >> offset;
989 # GLSL bitfieldExtract()
990 opcode("ubitfield_extract", 0, tuint32
,
991 [0, 0, 0], [tuint32
, tint32
, tint32
], False, "", """
992 unsigned base = src0;
993 int offset = src1, bits = src2;
996 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
997 dst = 0; /* undefined per the spec */
999 dst = (base >> offset) & ((1ull << bits) - 1);
1002 opcode("ibitfield_extract", 0, tint32
,
1003 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
1005 int offset = src1, bits = src2;
1008 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
1011 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
1015 # Combines the first component of each input to make a 3-component vector.
1017 triop_horiz("vec3", 3, 1, 1, 1, """
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Declare a four-source opcode with explicit per-operand sizes.

    The result and all four sources are typed tuint.  *output_size* and the
    four source sizes are forwarded to opcode() as given; the algebraic
    properties string is always empty and the argument after the input
    types is always False.
    """
    input_sizes = [src1_size, src2_size, src3_size, src4_size]
    input_types = [tuint] * len(input_sizes)
    opcode(name, output_size, tuint, input_sizes, input_types,
           False, "", const_expr)
1030 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
1031 [tuint32
, tuint32
, tint32
, tint32
], False, "", """
1032 unsigned base = src0, insert = src1;
1033 int offset = src2, bits = src3;
1036 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
1039 unsigned mask = ((1ull << bits) - 1) << offset;
1040 dst = (base & ~mask) | ((insert << offset) & mask);
1044 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
1051 # An integer multiply instruction for address calculation. This is
1052 # similar to imul, except that the results are undefined in case of
1053 # overflow. Overflow is defined according to the size of the variable
1054 # being dereferenced.
1056 # This relaxed definition, compared to imul, allows an optimization
1057 # pass to propagate bounds (i.e., from a load/store intrinsic) to the
1058 # sources, such that lower precision integer multiplies can be used.
1059 # This is useful on hw that has 24b or perhaps 16b integer multiply
# Constant-folds as a plain multiply; declared both commutative and
# associative.
binop(
    "amul",
    tint,
    _2src_commutative + associative,
    "src0 * src1",
)
1063 # ir3-specific instruction that maps directly to mul-add shift high mix,
1064 # (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
1065 # multiplication (imul) on the Freedreno backend.
1066 opcode("imadsh_mix16", 1, tint32
,
1067 [1, 1, 1], [tint32
, tint32
, tint32
], False, "", """
1068 dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
1071 # ir3-specific instruction that maps directly to ir3 mad.s24.
1073 # 24b multiply into 32b result (with sign extension) plus 32b int
# Each multiplicand is shifted left and then right by 8 to sign-extend its
# low 24 bits; the two are multiplied and src2 is added to the product.
triop(
    "imad24_ir3",
    tint32,
    _2src_commutative,
    "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2",
)
1077 # 24b multiply into 32b result (with sign extension)
# Each multiplicand is shifted left and then right by 8 to sign-extend its
# low 24 bits before the two are multiplied.
binop(
    "imul24",
    tint32,
    _2src_commutative + associative,
    "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)",
)