2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, is_conversion
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - is_conversion is true if this opcode represents a type conversion
      - algebraic_properties is a space-separated string, where nir_op_is_ is
44 prepended before each entry
45 - const_expr is an expression or series of statements that computes the
46 constant value of the opcode given the constant values of its inputs.
48 Constant expressions are formed from the variables src0, src1, ...,
49 src(N-1), where N is the number of arguments. The output of the
50 expression should be stored in the dst variable. Per-component input
51 and output variables will be scalars and non-per-component input and
52 output variables will be a struct with fields named x, y, z, and w
53 all of the correct type. Input and output variables can be assumed
54 to already be of the correct type and need no conversion. In
55 particular, the conversion from the C bool type to/from NIR_TRUE and
56 NIR_FALSE happens automatically.
58 For per-component instructions, the entire expression will be
59 executed once for each component. For non-per-component
60 instructions, the expression is expected to store the correct values
61 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
62 constant expression, an assignment to dst will happen automatically
63 and the result will be equivalent to "dst = <expression>" for
64 per-component instructions and "dst.x = dst.y = ... = <expression>"
65 for non-per-component instructions.
67 assert isinstance(name
, str)
68 assert isinstance(output_size
, int)
69 assert isinstance(output_type
, str)
70 assert isinstance(input_sizes
, list)
71 assert isinstance(input_sizes
[0], int)
72 assert isinstance(input_types
, list)
73 assert isinstance(input_types
[0], str)
74 assert isinstance(is_conversion
, bool)
75 assert isinstance(algebraic_properties
, str)
76 assert isinstance(const_expr
, str)
77 assert len(input_sizes
) == len(input_types
)
78 assert 0 <= output_size
<= 4 or (output_size
== 8) or (output_size
== 16)
79 for size
in input_sizes
:
84 self
.num_inputs
= len(input_sizes
)
85 self
.output_size
= output_size
86 self
.output_type
= output_type
87 self
.input_sizes
= input_sizes
88 self
.input_types
= input_types
89 self
.is_conversion
= is_conversion
90 self
.algebraic_properties
= algebraic_properties
91 self
.const_expr
= const_expr
93 # helper variables for strings
111 _TYPE_SPLIT_RE
= re
.compile(r
'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
def type_has_size(type_):
    """Tell whether a NIR type string carries an explicit bit size.

    The string is matched (from its start) against _TYPE_SPLIT_RE, i.e. a
    base type of int/uint/float/bool optionally followed by decimal digits.

    Returns True when the digits are present (e.g. "float32") and False
    when only the base type is given (e.g. "float").  Asserts if the
    string does not start with a known base type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
def type_size(type_):
    """Return the bit size embedded in a sized NIR type string as an int.

    For example "uint16" yields 16.  Asserts if the string does not start
    with a known base type, or if it has no trailing bit size.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
           'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
125 def type_sizes(type_
):
126 if type_has_size(type_
):
127 return [type_size(type_
)]
128 elif type_
== 'bool':
129 return [1, 8, 16, 32]
130 elif type_
== 'float':
133 return [1, 8, 16, 32, 64]
def type_base_type(type_):
    """Return the base part of a NIR type string with any bit size removed.

    The result is one of "int", "uint", "float" or "bool" (the alternatives
    of the 'type' group in _TYPE_SPLIT_RE).  Asserts if the string does not
    start with one of them.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
140 # Operation where the first two sources are commutative.
# For 2-source operations, this is just mathematical commutativity.  Some
143 # 3-source operations, like ffma, are only commutative in the first two
145 _2src_commutative
= "2src_commutative "
146 associative
= "associative "
148 # global dictionary of opcodes
151 def opcode(name
, output_size
, output_type
, input_sizes
, input_types
,
152 is_conversion
, algebraic_properties
, const_expr
):
153 assert name
not in opcodes
154 opcodes
[name
] = Opcode(name
, output_size
, output_type
, input_sizes
,
155 input_types
, is_conversion
, algebraic_properties
,
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output and input types may differ.

    The opcode is registered with output size 0, a single source of size 0,
    no algebraic properties, and is_conversion=False.
    NOTE(review): size 0 appears to mark per-component operation -- confirm
    against the Opcode class.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0],
           input_types=[in_type],
           is_conversion=False,
           algebraic_properties="",
           const_expr=const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type ty.

    Output size and source size are both 0, there are no algebraic
    properties, and the opcode is not flagged as a conversion.
    """
    opcode(name,
           output_size=0,
           output_type=ty,
           input_sizes=[0],
           input_types=[ty],
           is_conversion=False,
           algebraic_properties="",
           const_expr=const_expr)
164 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
166 opcode(name
, output_size
, output_type
, [input_size
], [input_type
],
167 False, "", const_expr
)
169 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
170 reduce_expr
, final_expr
):
172 return "(" + prereduce_expr
.format(src
=src
) + ")"
174 return final_expr
.format(src
="(" + src
+ ")")
175 def reduce_(src0
, src1
):
176 return reduce_expr
.format(src0
=src0
, src1
=src1
)
177 src0
= prereduce("src0.x")
178 src1
= prereduce("src0.y")
179 src2
= prereduce("src0.z")
180 src3
= prereduce("src0.w")
181 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
182 final(reduce_(src0
, src1
)))
183 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
184 final(reduce_(reduce_(src0
, src1
), src2
)))
185 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
186 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion.

    Identical to a plain one-source registration (output size 0, one source
    of size 0, no algebraic properties) except that is_conversion is True.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0],
           input_types=[in_type],
           is_conversion=True,
           algebraic_properties="",
           const_expr=const_expr)
191 unop("mov", tuint
, "src0")
193 unop("ineg", tint
, "-src0")
194 unop("fneg", tfloat
, "-src0")
195 unop("inot", tint
, "~src0") # invert every bit of the integer
196 unop("fsign", tfloat
, ("bit_size == 64 ? " +
197 "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
198 "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
199 unop("isign", tint
, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
200 unop("iabs", tint
, "(src0 < 0) ? -src0 : src0")
201 unop("fabs", tfloat
, "fabs(src0)")
202 unop("fsat", tfloat
, ("bit_size == 64 ? " +
203 "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
204 "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
205 unop("frcp", tfloat
, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
206 unop("frsq", tfloat
, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
207 unop("fsqrt", tfloat
, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
208 unop("fexp2", tfloat
, "exp2f(src0)")
209 unop("flog2", tfloat
, "log2f(src0)")
211 # Generate all of the numeric conversion opcodes
212 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
214 dst_types
= [tfloat
, tint
]
216 dst_types
= [tfloat
, tint
, tbool
]
218 dst_types
= [tfloat
, tuint
]
219 elif src_t
== tfloat
:
220 dst_types
= [tint
, tuint
, tfloat
, tbool
]
222 for dst_t
in dst_types
:
223 for dst_bit_size
in type_sizes(dst_t
):
224 if dst_bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
225 rnd_modes
= ['_rtne', '_rtz', '']
226 for rnd_mode
in rnd_modes
:
227 if rnd_mode
== '_rtne':
230 dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
235 elif rnd_mode
== '_rtz':
238 dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
246 unop_numeric_convert("{0}2{1}{2}{3}".format(src_t
[0],
250 dst_t
+ str(dst_bit_size
),
252 elif dst_bit_size
== 32 and dst_t
== tfloat
and src_t
== tfloat
:
254 if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
255 dst = _mesa_double_to_float_rtz(src0);
260 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
262 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
264 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
265 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
267 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
269 # Special opcode that is the same as f2f16 except that it is safe to remove it
270 # if the result is immediately converted back to float32 again. This is
271 # generated as part of the precision lowering pass. mp stands for medium
273 unop_numeric_convert("f2fmp", tfloat16
, tfloat
, opcodes
["f2f16"].const_expr
)
275 # Unary floating-point rounding operations.
278 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
279 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
280 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
281 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
282 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
284 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
286 # Trigonometric operations.
289 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
290 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
293 unop_convert("frexp_exp", tint32
, tfloat
, "frexp(src0, &dst);")
294 unop_convert("frexp_sig", tfloat
, tfloat
, "int n; dst = frexp(src0, &n);")
296 # Partial derivatives.
299 unop("fddx", tfloat
, "0.0") # the derivative of a constant is 0.
300 unop("fddy", tfloat
, "0.0")
301 unop("fddx_fine", tfloat
, "0.0")
302 unop("fddy_fine", tfloat
, "0.0")
303 unop("fddx_coarse", tfloat
, "0.0")
304 unop("fddy_coarse", tfloat
, "0.0")
307 # Floating point pack and unpack operations.
310 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
311 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
312 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
313 """.replace("fmt", fmt
))
316 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
317 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
318 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
319 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
320 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
321 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the "unpack_<fmt>_2x16" opcode.

    The opcode takes one 32-bit unsigned source and produces two 32-bit
    floats.  Every occurrence of "fmt" in the C constant expression is
    replaced by the given format name, so the expression ends up calling
    unpack_<fmt>_1x16() on each 16-bit half of the source.
    """
    # The second component must come from the HIGH half of the word.  The
    # previous "(uint16_t)(src0.x << 16)" always truncated to zero, so dst.y
    # was constant-folded from 0 regardless of the input.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
330 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
331 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
332 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
333 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
334 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
335 """.replace("fmt", fmt
))
349 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
350 dst.x = (src0.x & 0xffff) | (src0.y << 16);
353 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
354 dst.x = (src0.x << 0) |
360 unop_horiz("pack_32_2x16", 1, tuint32
, 2, tuint16
,
361 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
363 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
364 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
366 unop_horiz("pack_64_4x16", 1, tuint64
, 4, tuint16
,
367 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
369 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
370 "dst.x = src0.x; dst.y = src0.x >> 32;")
# Split one 64-bit value into four 16-bit components.  The source has a
# single component, so every output (including dst.w) must be taken from
# src0.x; reading src0.w would reference a component that does not exist.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
375 unop_horiz("unpack_32_2x16", 2, tuint16
, 1, tuint32
,
376 "dst.x = src0.x; dst.y = src0.x >> 16;")
378 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32
, 1, tuint32
, """
379 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
380 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
383 # Lowered floating point unpacking operations.
385 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
386 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
387 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
388 "unpack_half_1x16((uint16_t)(src0 >> 16))")
390 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32
, tuint32
,
391 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
392 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32
, tuint32
,
393 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
395 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
396 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
398 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
399 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
401 # Bit operations, part of ARB_gpu_shader5.
404 unop("bitfield_reverse", tuint32
, """
405 /* we're not winning any awards for speed here, but that's ok */
407 for (unsigned bit = 0; bit < 32; bit++)
408 dst |= ((src0 >> bit) & 1) << (31 - bit);
410 unop_convert("bit_count", tuint32
, tuint
, """
412 for (unsigned bit = 0; bit < bit_size; bit++) {
413 if ((src0 >> bit) & 1)
418 unop_convert("ufind_msb", tint32
, tuint
, """
420 for (int bit = bit_size - 1; bit >= 0; bit--) {
421 if ((src0 >> bit) & 1) {
428 unop("uclz", tuint32
, """
430 for (bit = bit_size - 1; bit >= 0; bit--) {
431 if ((src0 & (1u << bit)) != 0)
434 dst = (unsigned)(31 - bit);
437 unop("ifind_msb", tint32
, """
439 for (int bit = 31; bit >= 0; bit--) {
440 /* If src0 < 0, we're looking for the first 0 bit.
441 * if src0 >= 0, we're looking for the first 1 bit.
443 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
444 (!((src0 >> bit) & 1) && (src0 < 0))) {
451 unop_convert("find_lsb", tint32
, tint
, """
453 for (unsigned bit = 0; bit < bit_size; bit++) {
454 if ((src0 >> bit) & 1) {
462 for i
in range(1, 5):
463 for j
in range(1, 5):
464 unop_horiz("fnoise{0}_{1}".format(i
, j
), i
, tfloat
, j
, tfloat
, "0.0f")
467 # AMD_gcn_shader extended instructions
468 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
470 float absX = fabsf(src0.x);
471 float absY = fabsf(src0.y);
472 float absZ = fabsf(src0.z);
475 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
476 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
477 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
479 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
480 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
481 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
482 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
483 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
484 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
486 dst.x = dst.x / ma + 0.5;
487 dst.y = dst.y / ma + 0.5;
490 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
491 float absX = fabsf(src0.x);
492 float absY = fabsf(src0.y);
493 float absZ = fabsf(src0.z);
494 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
495 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
496 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
497 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
498 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
499 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
502 # Sum of vector components
503 unop_reduce("fsum", 1, tfloat
, tfloat
, "{src}", "{src0} + {src1}", "{src}")
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share one type.

    Both sources have type in_type and size 0; the result has type out_type
    and size 0.  alg_props is passed through as the algebraic-properties
    string, and the opcode is not flagged as a conversion.
    """
    source_sizes = [0, 0]
    source_types = [in_type, in_type]
    opcode(name, 0, out_type, source_sizes, source_types,
           is_conversion=False,
           algebraic_properties=alg_props,
           const_expr=const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode where both sources and the result have type ty."""
    binop_convert(name,
                  out_type=ty,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty whose result type is tbool1."""
    binop_convert(name,
                  out_type=tbool1,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare8(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty whose result type is tbool8."""
    binop_convert(name,
                  out_type=tbool8,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare16(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty whose result type is tbool16."""
    binop_convert(name,
                  out_type=tbool16,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty whose result type is tbool32."""
    binop_convert(name,
                  out_type=tbool32,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register one comparison opcode per boolean result width.

    Four opcodes are created, in this order: <name> (via binop_compare),
    <name>8, <name>16 and <name>32 (via the matching binop_compareN
    helper).  All four share the same source type, algebraic properties and
    constant expression.
    """
    variants = (
        ("", binop_compare),
        ("8", binop_compare8),
        ("16", binop_compare16),
        ("32", binop_compare32),
    )
    for suffix, register in variants:
        register(name + suffix, ty, alg_props, const_expr)
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types per operand.

    Unlike binop(), the output and each source get their own size and type.
    The opcode has no algebraic properties and is not flagged as a
    conversion.
    """
    source_sizes = [src1_size, src2_size]
    source_types = [src1_type, src2_type]
    opcode(name, out_size, out_type, source_sizes, source_types,
           is_conversion=False,
           algebraic_properties="",
           const_expr=const_expr)
535 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
536 reduce_expr
, final_expr
):
538 return final_expr
.format(src
= "(" + src
+ ")")
539 def reduce_(src0
, src1
):
540 return reduce_expr
.format(src0
=src0
, src1
=src1
)
541 def prereduce(src0
, src1
):
542 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
543 src0
= prereduce("src0.x", "src1.x")
544 src1
= prereduce("src0.y", "src1.y")
545 src2
= prereduce("src0.z", "src1.z")
546 src3
= prereduce("src0.w", "src1.w")
547 opcode(name
+ "2", output_size
, output_type
,
548 [2, 2], [src_type
, src_type
], False, _2src_commutative
,
549 final(reduce_(src0
, src1
)))
550 opcode(name
+ "3", output_size
, output_type
,
551 [3, 3], [src_type
, src_type
], False, _2src_commutative
,
552 final(reduce_(reduce_(src0
, src1
), src2
)))
553 opcode(name
+ "4", output_size
, output_type
,
554 [4, 4], [src_type
, src_type
], False, _2src_commutative
,
555 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Register a reduction opcode family once per boolean result width.

    binop_reduce() is invoked four times, in this order:
      - with the given name and result type tbool1,
      - with the first character of name replaced by "b8", "b16" and "b32"
        and result types tbool8, tbool16 and tbool32 respectively.
    All other arguments are forwarded unchanged.
    """
    widths = (
        (None, tbool1),
        ("b8", tbool8),
        ("b16", tbool16),
        ("b32", tbool32),
    )
    for prefix, bool_type in widths:
        # The 1-bit variant keeps the caller's name verbatim; the wider
        # variants swap the leading character for the sized prefix.
        op_name = name if prefix is None else prefix + name[1:]
        binop_reduce(op_name, output_size, bool_type, src_type,
                     prereduce_expr, reduce_expr, final_expr)
568 binop("fadd", tfloat
, _2src_commutative
+ associative
,"""
569 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
571 dst = _mesa_double_add_rtz(src0, src1);
573 dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
578 binop("iadd", tint
, _2src_commutative
+ associative
, "src0 + src1")
579 binop("iadd_sat", tint
, _2src_commutative
, """
581 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
582 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
584 binop("uadd_sat", tuint
, _2src_commutative
,
585 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
586 binop("isub_sat", tint
, "", """
588 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
589 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
591 binop("usub_sat", tuint
, "", "src0 < src1 ? 0 : src0 - src1")
593 binop("fsub", tfloat
, "", """
594 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
596 dst = _mesa_double_sub_rtz(src0, src1);
598 dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
603 binop("isub", tint
, "", "src0 - src1")
604 binop_convert("uabs_isub", tuint
, tint
, "", """
605 src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
606 : (uint64_t) src0 - (uint64_t) src1
608 binop("uabs_usub", tuint
, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
610 binop("fmul", tfloat
, _2src_commutative
+ associative
, """
611 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
613 dst = _mesa_double_mul_rtz(src0, src1);
615 dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
620 # low 32-bits of signed/unsigned integer multiply
621 binop("imul", tint
, _2src_commutative
+ associative
, "src0 * src1")
623 # Generate 64 bit result from 2 32 bits quantity
624 binop_convert("imul_2x32_64", tint64
, tint32
, _2src_commutative
,
625 "(int64_t)src0 * (int64_t)src1")
626 binop_convert("umul_2x32_64", tuint64
, tuint32
, _2src_commutative
,
627 "(uint64_t)src0 * (uint64_t)src1")
629 # high 32-bits of signed integer multiply
630 binop("imul_high", tint
, _2src_commutative
, """
631 if (bit_size == 64) {
632 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
633 * extension to work properly. The casts are kind-of annoying but needed
634 * to prevent compiler warnings.
636 uint32_t src0_u32[4] = {
642 uint32_t src1_u32[4] = {
648 uint32_t prod_u32[4];
649 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
650 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
652 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
656 # high 32-bits of unsigned integer multiply
657 binop("umul_high", tuint
, _2src_commutative
, """
658 if (bit_size == 64) {
659 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
660 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
661 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
662 uint32_t prod_u32[4];
663 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
664 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
666 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
670 # low 32-bits of unsigned integer multiply
671 binop("umul_low", tuint32
, _2src_commutative
, """
672 uint64_t mask = (1 << (bit_size / 2)) - 1;
673 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
676 # Multiply 32-bits with low 16-bits.
677 binop("imul_32x16", tint32
, "", "src0 * (int16_t) src1")
678 binop("umul_32x16", tuint32
, "", "src0 * (uint16_t) src1")
680 binop("fdiv", tfloat
, "", "src0 / src1")
681 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
682 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
684 # returns a boolean representing the carry resulting from the addition of
685 # the two unsigned arguments.
687 binop_convert("uadd_carry", tuint
, tuint
, _2src_commutative
, "src0 + src1 < src0")
689 # returns a boolean representing the borrow resulting from the subtraction
690 # of the two unsigned arguments.
692 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
694 # hadd: (a + b) >> 1 (without overflow)
695 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
696 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
697 # = 2 * (x & y) + (x & ~y) + (~x & y)
698 # = ((x & y) << 1) + (x ^ y)
700 # Since we know that the bottom bit of (x & y) << 1 is zero,
702 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
703 # = (x & y) + ((x ^ y) >> 1)
704 binop("ihadd", tint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
705 binop("uhadd", tuint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
707 # rhadd: (a + b + 1) >> 1 (without overflow)
708 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
709 # = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
710 # = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
711 # = ((x | y) << 1) - (x ^ y) + 1
# Since we know that the bottom bit of (x | y) << 1 is zero,
#    (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                     = (x | y) - ((x ^ y) >> 1)
# Rounding halving add: (a + b + 1) >> 1 computed without overflow as
# (a | b) - ((a ^ b) >> 1).  The second term must be SUBTRACTED; adding it
# gives wrong results (e.g. a = 3, b = 0 must yield 2, not 4).
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
720 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
722 # For signed integers, there are several different possible definitions of
723 # "modulus" or "remainder". We follow the conventions used by LLVM and
724 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
725 # operation while the imod opcode implements the more mathematical
726 # "modulus" operation. For details on the difference, see
728 # http://mathforum.org/library/drmath/view/52343.html
730 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
731 binop("imod", tint
, "",
732 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
733 " src0 % src1 : src0 % src1 + src1)")
734 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
735 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
742 # these integer-aware comparisons return a boolean (0 or ~0)
744 binop_compare_all_sizes("flt", tfloat
, "", "src0 < src1")
745 binop_compare_all_sizes("fge", tfloat
, "", "src0 >= src1")
746 binop_compare_all_sizes("feq", tfloat
, _2src_commutative
, "src0 == src1")
747 binop_compare_all_sizes("fne", tfloat
, _2src_commutative
, "src0 != src1")
748 binop_compare_all_sizes("ilt", tint
, "", "src0 < src1")
749 binop_compare_all_sizes("ige", tint
, "", "src0 >= src1")
750 binop_compare_all_sizes("ieq", tint
, _2src_commutative
, "src0 == src1")
751 binop_compare_all_sizes("ine", tint
, _2src_commutative
, "src0 != src1")
752 binop_compare_all_sizes("ult", tuint
, "", "src0 < src1")
753 binop_compare_all_sizes("uge", tuint
, "", "src0 >= src1")
755 # integer-aware GLSL-style comparisons that compare floats and ints
757 binop_reduce_all_sizes("ball_fequal", 1, tfloat
, "{src0} == {src1}",
758 "{src0} && {src1}", "{src}")
759 binop_reduce_all_sizes("bany_fnequal", 1, tfloat
, "{src0} != {src1}",
760 "{src0} || {src1}", "{src}")
761 binop_reduce_all_sizes("ball_iequal", 1, tint
, "{src0} == {src1}",
762 "{src0} && {src1}", "{src}")
763 binop_reduce_all_sizes("bany_inequal", 1, tint
, "{src0} != {src1}",
764 "{src0} || {src1}", "{src}")
766 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
768 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
769 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
770 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
771 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
773 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
774 # and false respectively
776 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
777 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
778 binop("seq", tfloat32
, _2src_commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
779 binop("sne", tfloat32
, _2src_commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
781 # SPIRV shifts are undefined for shift-operands >= bitsize,
782 # but SM5 shifts are defined to use the least significant bits, only
783 # The NIR definition is according to the SM5 specification.
784 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
785 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
786 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
787 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
788 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "",
789 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
791 opcode("urol", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
792 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
793 dst = (src0 << (src1 & rotate_mask)) |
794 (src0 >> (-src1 & rotate_mask));
796 opcode("uror", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
797 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
798 dst = (src0 >> (src1 & rotate_mask)) |
799 (src0 << (-src1 & rotate_mask));
802 # bitwise logic operators
804 # These are also used as boolean and, or, xor for hardware supporting
808 binop("iand", tuint
, _2src_commutative
+ associative
, "src0 & src1")
809 binop("ior", tuint
, _2src_commutative
+ associative
, "src0 | src1")
810 binop("ixor", tuint
, _2src_commutative
+ associative
, "src0 ^ src1")
813 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
816 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
817 "{src0} * {src1}", "{src0} + {src1}", "{src}")
819 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
820 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
821 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
822 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
824 binop("fmin", tfloat
, _2src_commutative
+ associative
, "fmin(src0, src1)")
825 binop("imin", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
826 binop("umin", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
827 binop("fmax", tfloat
, _2src_commutative
+ associative
, "fmax(src0, src1)")
828 binop("imax", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
829 binop("umax", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
831 # Saturated vector add for 4 8bit ints.
832 binop("usadd_4x8", tint32
, _2src_commutative
+ associative
, """
834 for (int i = 0; i < 32; i += 8) {
835 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
839 # Saturated vector subtract for 4 8bit ints.
840 binop("ussub_4x8", tint32
, "", """
842 for (int i = 0; i < 32; i += 8) {
843 int src0_chan = (src0 >> i) & 0xff;
844 int src1_chan = (src1 >> i) & 0xff;
845 if (src0_chan > src1_chan)
846 dst |= (src0_chan - src1_chan) << i;
850 # vector min for 4 8bit ints.
851 binop("umin_4x8", tint32
, _2src_commutative
+ associative
, """
853 for (int i = 0; i < 32; i += 8) {
854 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
858 # vector max for 4 8bit ints.
859 binop("umax_4x8", tint32
, _2src_commutative
+ associative
, """
861 for (int i = 0; i < 32; i += 8) {
862 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
866 # unorm multiply: (a * b) / 255.
867 binop("umul_unorm_4x8", tint32
, _2src_commutative
+ associative
, """
869 for (int i = 0; i < 32; i += 8) {
870 int src0_chan = (src0 >> i) & 0xff;
871 int src1_chan = (src1 >> i) & 0xff;
872 dst |= ((src0_chan * src1_chan) / 255) << i;
# Use the double-precision pow() for 64-bit operands and powf() otherwise,
# matching the other bit_size-dependent float opcodes (fsqrt, fsin, ...).
# The two calls were previously swapped, so 64-bit constant folding lost
# precision by going through powf().
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
878 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
879 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
881 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
882 "src0 | ((uint64_t)src1 << 32)")
884 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
885 "src0 | ((uint32_t)src1 << 16)")
887 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
888 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
889 # are from the low five bits of src0 and src1, respectively.
890 binop_convert("bfm", tuint32
, tint32
, "", """
891 int bits = src0 & 0x1F;
892 int offset = src1 & 0x1F;
893 dst = ((1u << bits) - 1) << offset;
# ldexp takes a float mantissa (src0) and a 32-bit integer exponent (src1).
# The double or float C routine is selected by bit size, and any result that
# is not a normal number is replaced by a zero carrying the sign of src0.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")
# Byte extraction: shift src0 right by src1 bytes and keep the low 8 bits,
# via an unsigned (u8) or signed (i8) cast.
binop(
    "extract_u8", tuint, "",
    "(uint8_t)(src0 >> (src1 * 8))",
)
binop(
    "extract_i8", tint, "",
    "(int8_t)(src0 >> (src1 * 8))",
)

# Word extraction: same as above with 16-bit units.
binop(
    "extract_u16", tuint, "",
    "(uint16_t)(src0 >> (src1 * 16))",
)
binop(
    "extract_i16", tint, "",
    "(int16_t)(src0 >> (src1 * 16))",
)
def triop(name, ty, alg_props, const_expr):
    """Declare a three-source opcode whose result and sources all use `ty`.

    The opcode is registered with output size 0, three size-0 inputs and
    is_conversion set to False; `alg_props` and `const_expr` are passed
    through to opcode() unchanged.
    """
    source_sizes = [0] * 3
    source_types = [ty] * 3
    opcode(name, 0, ty, source_sizes, source_types, False, alg_props,
           const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Declare a three-source tuint opcode with explicit component counts.

    The output has `output_size` components and the sources have
    `src1_size`, `src2_size` and `src3_size` components.  The opcode is
    registered as a non-conversion with no algebraic properties.
    """
    source_sizes = [src1_size, src2_size, src3_size]
    source_types = [tuint] * len(source_sizes)
    opcode(name, output_size, tuint, source_sizes, source_types,
           False, "", const_expr)
# Fused multiply-add.  When the execution mode asks for round-toward-zero
# at this bit size, the software RTZ helpers are used (sizes other than 32
# and 64 go through the double helper and are then narrowed with RTZ);
# otherwise the C library fmaf()/fma() is used.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")
# Linear interpolation between src0 and src1, weighted by src2.
triop(
    "flrp", tfloat, "",
    "src0 * (1 - src2) + src1 * src2",
)

# Conditional select
#
# A per-component vector select (like ?:, applied to each component of a
# vector).  It comes in two flavours: one driven by floating point bools
# (0.0 vs 1.0) and one driven by integer bools (0 vs ~0).

# Float flavour: any non-zero src0 picks src1, otherwise src2.
triop(
    "fcsel", tfloat32, "",
    "(src0 != 0.0f) ? src1 : src2",
)
# 3-way min / max / median.
#
# The float variants are declared for every float bit size, so they must
# fold through the double-precision fmin()/fmax(): the single-precision
# fminf()/fmaxf() would truncate 64-bit operands.  Narrower operands are
# unaffected, because widening to double is exact and min/max only ever
# return one of their inputs.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
# Boolean-driven selects: src0 is the condition, src1 the value taken when
# it is true and src2 the value taken otherwise.  The four variants differ
# only in the boolean type of the condition source.
opcode(
    "bcsel", 0, tuint,
    [0, 0, 0], [tbool1, tuint, tuint],
    False, "", "src0 ? src1 : src2",
)
opcode(
    "b8csel", 0, tuint,
    [0, 0, 0], [tbool8, tuint, tuint],
    False, "", "src0 ? src1 : src2",
)
opcode(
    "b16csel", 0, tuint,
    [0, 0, 0], [tbool16, tuint, tuint],
    False, "", "src0 ? src1 : src2",
)
opcode(
    "b32csel", 0, tuint,
    [0, 0, 0], [tbool32, tuint, tuint],
    False, "", "src0 ? src1 : src2",
)
# SM5 bfi assembly: insert `insert` into `base` under `mask`.  The insert
# value is first shifted left until it lines up with the lowest set bit of
# the mask; an all-zero mask returns `base` untouched.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")
# Bitwise select: each result bit comes from src1 where the matching src0
# bit is set, and from src2 where it is clear.
triop(
    "bitfield_select", tuint, "",
    "(src0 & src1) | (~src0 & src2)",
)
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
#
# A zero width yields 0.  When the field ends below bit 31 it is shifted up
# to the top of the word and back down by (32 - bits); otherwise the base is
# simply shifted right by offset.  ibfe keeps `base` signed so that the
# right shifts sign-extend.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
# GLSL bitfieldExtract()
#
# Unsigned variant: a zero width yields 0, an out-of-range offset/width
# pair also yields 0, and otherwise the field is shifted down and masked.
# The mask is built in 64 bits so that bits == 32 does not overflow.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed GLSL bitfieldExtract(): a zero width or an out-of-range
# offset/width pair yields 0.  Otherwise the field's top bit is shifted up
# to bit 31 and the value is shifted back down by (32 - bits), so the
# arithmetic right shift sign-extends the field and leaves it at bit 0.
# Shifting back by `offset` would only be right when offset == 32 - bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Declare a four-source tuint opcode with explicit component counts.

    The output has `output_size` components and the four sources have the
    given per-source component counts.  The opcode is registered as a
    non-conversion with no algebraic properties.
    """
    source_sizes = [src1_size, src2_size, src3_size, src4_size]
    source_types = [tuint] * len(source_sizes)
    opcode(name, output_size, tuint, source_sizes, source_types,
           False, "", const_expr)
# Insert the low `bits` bits of `insert` into `base` at `offset`.  A zero
# width returns `base` unchanged and an out-of-range offset/width pair
# yields 0.  The mask is built in 64 bits so that bits == 32 does not
# overflow.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
# Wider vector constructors: each single-component source becomes one
# component of the result, in source order.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")
# Integer multiply intended for address calculation.  It behaves like imul
# except that the result is undefined on overflow, where overflow is judged
# against the size of the variable being dereferenced.
#
# Compared to imul, this looser contract lets an optimization pass
# propagate bounds (i.e. from a load/store intrinsic) back to the sources,
# so that a lower-precision integer multiply can be used.  That helps on
# hardware with 24-bit or perhaps 16-bit integer multipliers.
binop(
    "amul", tint, _2src_commutative + associative,
    "src0 * src1",
)
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
#
# The high 16 bits of src0 are multiplied by the low 16 bits of src1, the
# product is shifted left by 16, and src2 is added.
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")
# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24-bit multiply into a 32-bit result (with sign extension), plus a 32-bit
# integer addend.  The "<< 8 >> 8" pair sign-extends the low 24 bits of
# each factor.
triop(
    "imad24_ir3", tint32, _2src_commutative,
    "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2",
)

# 24-bit multiply into a 32-bit result (with sign extension).
binop(
    "imul24", tint32, _2src_commutative + associative,
    "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)",
)