2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, is_conversion
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - is_conversion is true if this opcode represents a type conversion
43 - algebraic_properties is a space-separated string, where nir_op_is_ is
44 prepended before each entry
45 - const_expr is an expression or series of statements that computes the
46 constant value of the opcode given the constant values of its inputs.
48 Constant expressions are formed from the variables src0, src1, ...,
49 src(N-1), where N is the number of arguments. The output of the
50 expression should be stored in the dst variable. Per-component input
51 and output variables will be scalars and non-per-component input and
52 output variables will be a struct with fields named x, y, z, and w
53 all of the correct type. Input and output variables can be assumed
54 to already be of the correct type and need no conversion. In
55 particular, the conversion from the C bool type to/from NIR_TRUE and
56 NIR_FALSE happens automatically.
58 For per-component instructions, the entire expression will be
59 executed once for each component. For non-per-component
60 instructions, the expression is expected to store the correct values
61 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
62 constant expression, an assignment to dst will happen automatically
63 and the result will be equivalent to "dst = <expression>" for
64 per-component instructions and "dst.x = dst.y = ... = <expression>"
65 for non-per-component instructions.
67 assert isinstance(name
, str)
68 assert isinstance(output_size
, int)
69 assert isinstance(output_type
, str)
70 assert isinstance(input_sizes
, list)
71 assert isinstance(input_sizes
[0], int)
72 assert isinstance(input_types
, list)
73 assert isinstance(input_types
[0], str)
74 assert isinstance(is_conversion
, bool)
75 assert isinstance(algebraic_properties
, str)
76 assert isinstance(const_expr
, str)
77 assert len(input_sizes
) == len(input_types
)
78 assert 0 <= output_size
<= 4 or (output_size
== 8) or (output_size
== 16)
79 for size
in input_sizes
:
84 self
.num_inputs
= len(input_sizes
)
85 self
.output_size
= output_size
86 self
.output_type
= output_type
87 self
.input_sizes
= input_sizes
88 self
.input_types
= input_types
89 self
.is_conversion
= is_conversion
90 self
.algebraic_properties
= algebraic_properties
91 self
.const_expr
= const_expr
93 # helper variables for strings
# Splits a NIR type string into a base type ("int", "uint", "float" or "bool")
# and an optional decimal bit-size suffix, e.g. "uint32" -> ("uint", "32").
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def _match_type(type_):
    """Match type_ against _TYPE_SPLIT_RE, asserting it is a NIR type string."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m


def type_has_size(type_):
    """Return True if the NIR type string carries an explicit bit size.

    Raises AssertionError if type_ does not start with a known base type.
    """
    return _match_type(type_).group('bits') is not None


def type_size(type_):
    """Return the bit size encoded in a sized NIR type string as an int.

    Raises AssertionError if type_ is not a NIR type string or is unsized.
    """
    m = _match_type(type_)
    assert m.group('bits') is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(m.group('bits'))


def type_sizes(type_):
    """Return the list of bit sizes that the given NIR type may take.

    A sized type yields a one-element list with its own size.  Unsized
    types yield every size permitted for their base type.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    if type_ == 'bool':
        return [1, 8, 16, 32]
    if type_ == 'float':
        # Floats have no 1-bit or 8-bit variants.
        return [16, 32, 64]
    # Unsized int / uint.  Callers iterate over the result, so a list must
    # always be returned.
    return [1, 8, 16, 32, 64]


def type_base_type(type_):
    """Return the base type of a NIR type string with any bit size removed.

    Raises AssertionError if type_ does not start with a known base type.
    """
    return _match_type(type_).group('type')
141 # Operation where the first two sources are commutative.
143 # For 2-source operations, this is just mathematical commutativity. Some
144 # 3-source operations, like ffma, are only commutative in the first two
# Algebraic property flags.  Each value ends with a space so that flags can
# be combined by plain string concatenation into the space-separated
# algebraic_properties string (e.g. _2src_commutative + associative).
_2src_commutative = "2src_commutative "
associative = "associative "
149 # global dictionary of opcodes
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Create an Opcode and register it in the global opcodes dictionary.

    All arguments are forwarded unchanged to the Opcode constructor.
    Raises AssertionError if an opcode called name is already registered.
    """
    assert name not in opcodes
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                           input_types, is_conversion, algebraic_properties,
                           const_expr)
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output type may differ from its input.

    Output and input sizes are both passed as 0 (presumably the
    per-component marker -- confirm against Opcode), is_conversion is
    False and no algebraic properties are set.
    """
    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose input and output share the type ty.

    Sizes are passed as 0, is_conversion is False and no algebraic
    properties are set.
    """
    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes.

    Unlike unop(), the caller supplies the component counts, so the
    constant expression is expected to address components explicitly
    (dst.x, src0.y, ...).  is_conversion is False and no algebraic
    properties are set.
    """
    opcode(name, output_size, output_type, [input_size], [input_type],
           False, "", const_expr)
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a unary reduction.

    The three expression arguments are str.format templates:
    - prereduce_expr is applied to each source component ("{src}"),
    - reduce_expr combines two partial results ("{src0}", "{src1}"),
    - final_expr post-processes the fully reduced value ("{src}").

    The opcodes are registered as name + "2", name + "3" and name + "4".
    """
    def prereduce(src):
        # Parenthesise so the result can be embedded in a larger expression.
        return "(" + prereduce_expr.format(src=src) + ")"

    def final(src):
        return final_expr.format(src="(" + src + ")")

    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)

    src0 = prereduce("src0.x")
    src1 = prereduce("src0.y")
    src2 = prereduce("src0.z")
    src3 = prereduce("src0.w")
    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               final(reduce_(src0, src1)))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               final(reduce_(reduce_(src0, src1), src2)))
    # The 4-wide variant reduces as a balanced tree: (x, y) with (z, w).
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source type-conversion opcode.

    Identical to unop_convert() except that is_conversion is passed as True.
    """
    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
# Basic unary opcodes.  Each string is the constant expression stored on the
# registered opcode; several select a double or float variant on bit_size.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("fsat_signed", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos", tfloat, ("fmax(src0, 0.0)"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
212 # Generate all of the numeric conversion opcodes
213 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
215 dst_types
= [tfloat
, tint
, tbool
]
217 dst_types
= [tfloat
, tint
, tbool
]
219 dst_types
= [tfloat
, tuint
]
220 elif src_t
== tfloat
:
221 dst_types
= [tint
, tuint
, tfloat
, tbool
]
223 for dst_t
in dst_types
:
224 for dst_bit_size
in type_sizes(dst_t
):
225 if dst_bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
226 rnd_modes
= ['_rtne', '_rtz', '']
227 for rnd_mode
in rnd_modes
:
228 if rnd_mode
== '_rtne':
231 dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
236 elif rnd_mode
== '_rtz':
239 dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
247 unop_numeric_convert("{0}2{1}{2}{3}".format(src_t
[0],
251 dst_t
+ str(dst_bit_size
),
253 elif dst_bit_size
== 32 and dst_t
== tfloat
and src_t
== tfloat
:
255 if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
256 dst = _mesa_double_to_float_rtz(src0);
261 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
263 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
265 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
266 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
268 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
270 # Special opcode that is the same as f2f16 except that it is safe to remove it
271 # if the result is immediately converted back to float32 again. This is
272 # generated as part of the precision lowering pass. mp stands for medium
274 unop_numeric_convert("f2fmp", tfloat16
, tfloat
, opcodes
["f2f16"].const_expr
)
276 # Unary floating-point rounding operations.
279 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
280 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
281 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
282 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
283 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
285 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
287 # Trigonometric operations.
290 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
291 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
294 unop_convert("frexp_exp", tint32
, tfloat
, "frexp(src0, &dst);")
295 unop_convert("frexp_sig", tfloat
, tfloat
, "int n; dst = frexp(src0, &n);")
297 # Partial derivatives.
300 unop("fddx", tfloat
, "0.0") # the derivative of a constant is 0.
301 unop("fddy", tfloat
, "0.0")
302 unop("fddx_fine", tfloat
, "0.0")
303 unop("fddy_fine", tfloat
, "0.0")
304 unop("fddx_coarse", tfloat
, "0.0")
305 unop("fddy_coarse", tfloat
, "0.0")
308 # Floating point pack and unpack operations.
311 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
312 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
313 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
314 """.replace("fmt", fmt
))
317 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
318 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
319 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
320 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
321 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
322 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the "unpack_<fmt>_2x16" opcode.

    The opcode takes one 32-bit unsigned source and produces two 32-bit
    floats.  Every "fmt" in the expression template is replaced by the
    given format name, so the template calls unpack_<fmt>_1x16().
    """
    # dst.y must come from the upper 16 bits.  Shifting left by 16 and then
    # truncating to uint16_t always yields 0, so shift right instead.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
331 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
332 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
333 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
334 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
335 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
336 """.replace("fmt", fmt
))
350 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
351 dst.x = (src0.x & 0xffff) | (src0.y << 16);
354 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
355 dst.x = (src0.x << 0) |
# Integer packing: combine several narrow unsigned components into one wider
# value, with component x in the least-significant bits.
unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Inverse of pack_64_2x32: split one 64-bit value into two 32-bit halves.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
# Split one 64-bit value into four 16-bit components.  The source has a
# single component, so every output (including dst.w) must read src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
# Split one 32-bit value into two 16-bit or four 8-bit components, with
# component x taken from the least-significant bits.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")
# Unpack two half floats from one 32-bit value via
# unpack_half_1x16_flush_to_zero().  dst.y comes from the upper 16 bits, so
# the source is shifted right; a left shift truncated to uint16_t is always 0.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
390 # Lowered floating point unpacking operations.
392 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
393 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
394 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
395 "unpack_half_1x16((uint16_t)(src0 >> 16))")
397 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32
, tuint32
,
398 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
399 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32
, tuint32
,
400 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
402 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
403 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
405 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
406 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
408 # Bit operations, part of ARB_gpu_shader5.
411 unop("bitfield_reverse", tuint32
, """
412 /* we're not winning any awards for speed here, but that's ok */
414 for (unsigned bit = 0; bit < 32; bit++)
415 dst |= ((src0 >> bit) & 1) << (31 - bit);
417 unop_convert("bit_count", tuint32
, tuint
, """
419 for (unsigned bit = 0; bit < bit_size; bit++) {
420 if ((src0 >> bit) & 1)
425 unop_convert("ufind_msb", tint32
, tuint
, """
427 for (int bit = bit_size - 1; bit >= 0; bit--) {
428 if ((src0 >> bit) & 1) {
435 unop("uclz", tuint32
, """
437 for (bit = bit_size - 1; bit >= 0; bit--) {
438 if ((src0 & (1u << bit)) != 0)
441 dst = (unsigned)(31 - bit);
444 unop("ifind_msb", tint32
, """
446 for (int bit = 31; bit >= 0; bit--) {
447 /* If src0 < 0, we're looking for the first 0 bit.
448 * if src0 >= 0, we're looking for the first 1 bit.
450 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
451 (!((src0 >> bit) & 1) && (src0 < 0))) {
458 unop_convert("find_lsb", tint32
, tint
, """
460 for (unsigned bit = 0; bit < bit_size; bit++) {
461 if ((src0 >> bit) & 1) {
468 # AMD_gcn_shader extended instructions
469 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
471 float absX = fabsf(src0.x);
472 float absY = fabsf(src0.y);
473 float absZ = fabsf(src0.z);
476 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
477 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
478 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
480 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
481 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
482 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
483 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
484 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
485 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
487 dst.x = dst.x / ma + 0.5;
488 dst.y = dst.y / ma + 0.5;
491 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
492 float absX = fabsf(src0.x);
493 float absY = fabsf(src0.y);
494 float absZ = fabsf(src0.z);
495 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
496 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
497 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
498 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
499 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
500 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
503 # Sum of vector components
504 unop_reduce("fsum", 1, tfloat
, tfloat
, "{src}", "{src0} + {src1}", "{src}")
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    The output type may differ from the source type.  All sizes are passed
    as 0 and is_conversion is False; alg_props is forwarded as the
    algebraic_properties string.
    """
    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
           False, alg_props, const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and output all have type ty."""
    binop_convert(name, ty, ty, alg_props, const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty with output type tbool1."""
    binop_convert(name, tbool1, ty, alg_props, const_expr)
def binop_compare8(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty with output type tbool8."""
    binop_convert(name, tbool8, ty, alg_props, const_expr)
def binop_compare16(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty with output type tbool16."""
    binop_convert(name, tbool16, ty, alg_props, const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty with output type tbool32."""
    binop_convert(name, tbool32, ty, alg_props, const_expr)
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register a comparison for every boolean result width.

    Four opcodes are created: name (tbool1 result), name + "8",
    name + "16" and name + "32".  All share the same constant expression
    and algebraic properties.
    """
    binop_compare(name, ty, alg_props, const_expr)
    binop_compare8(name + "8", ty, alg_props, const_expr)
    binop_compare16(name + "16", ty, alg_props, const_expr)
    binop_compare32(name + "32", ty, alg_props, const_expr)
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and per-source types.

    is_conversion is False and no algebraic properties are set.
    """
    opcode(name, out_size, out_type, [src1_size, src2_size],
           [src1_type, src2_type], False, "", const_expr)
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a binary reduction.

    The three expression arguments are str.format templates:
    - prereduce_expr combines matching components of the two sources
      ("{src0}", "{src1}"),
    - reduce_expr combines two partial results ("{src0}", "{src1}"),
    - final_expr post-processes the fully reduced value ("{src}").

    The opcodes are registered as name + "2", name + "3" and name + "4",
    each flagged as commutative in its two sources.
    """
    def final(src):
        return final_expr.format(src="(" + src + ")")

    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)

    def prereduce(src0, src1):
        # Parenthesise so the result can be embedded in a larger expression.
        return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"

    src0 = prereduce("src0.x", "src1.x")
    src1 = prereduce("src0.y", "src1.y")
    src2 = prereduce("src0.z", "src1.z")
    src3 = prereduce("src0.w", "src1.w")
    opcode(name + "2", output_size, output_type,
           [2, 2], [src_type, src_type], False, _2src_commutative,
           final(reduce_(src0, src1)))
    opcode(name + "3", output_size, output_type,
           [3, 3], [src_type, src_type], False, _2src_commutative,
           final(reduce_(reduce_(src0, src1), src2)))
    # The 4-wide variant reduces as a balanced tree: (x, y) with (z, w).
    opcode(name + "4", output_size, output_type,
           [4, 4], [src_type, src_type], False, _2src_commutative,
           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Register a boolean-valued binary reduction for every boolean width.

    The tbool1 variant keeps name as given.  The 8-, 16- and 32-bit
    variants replace the first character of name with "b8", "b16" and
    "b32" respectively (e.g. "ball_fequal" -> "b32all_fequal").
    """
    binop_reduce(name, output_size, tbool1, src_type,
                 prereduce_expr, reduce_expr, final_expr)
    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
                 prereduce_expr, reduce_expr, final_expr)
    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
                 prereduce_expr, reduce_expr, final_expr)
    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
                 prereduce_expr, reduce_expr, final_expr)
569 binop("fadd", tfloat
, _2src_commutative
+ associative
,"""
570 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
572 dst = _mesa_double_add_rtz(src0, src1);
574 dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
579 binop("iadd", tint
, _2src_commutative
+ associative
, "src0 + src1")
580 binop("iadd_sat", tint
, _2src_commutative
, """
582 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
583 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
585 binop("uadd_sat", tuint
, _2src_commutative
,
586 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
587 binop("isub_sat", tint
, "", """
589 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
590 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
592 binop("usub_sat", tuint
, "", "src0 < src1 ? 0 : src0 - src1")
594 binop("fsub", tfloat
, "", """
595 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
597 dst = _mesa_double_sub_rtz(src0, src1);
599 dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
604 binop("isub", tint
, "", "src0 - src1")
605 binop_convert("uabs_isub", tuint
, tint
, "", """
606 src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
607 : (uint64_t) src0 - (uint64_t) src1
609 binop("uabs_usub", tuint
, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
611 binop("fmul", tfloat
, _2src_commutative
+ associative
, """
612 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
614 dst = _mesa_double_mul_rtz(src0, src1);
616 dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
621 # low 32-bits of signed/unsigned integer multiply
622 binop("imul", tint
, _2src_commutative
+ associative
, "src0 * src1")
624 # Generate 64 bit result from 2 32 bits quantity
625 binop_convert("imul_2x32_64", tint64
, tint32
, _2src_commutative
,
626 "(int64_t)src0 * (int64_t)src1")
627 binop_convert("umul_2x32_64", tuint64
, tuint32
, _2src_commutative
,
628 "(uint64_t)src0 * (uint64_t)src1")
630 # high 32-bits of signed integer multiply
631 binop("imul_high", tint
, _2src_commutative
, """
632 if (bit_size == 64) {
633 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
634 * extension to work properly. The casts are kind-of annoying but needed
635 * to prevent compiler warnings.
637 uint32_t src0_u32[4] = {
643 uint32_t src1_u32[4] = {
649 uint32_t prod_u32[4];
650 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
651 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
653 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
657 # high 32-bits of unsigned integer multiply
658 binop("umul_high", tuint
, _2src_commutative
, """
659 if (bit_size == 64) {
660 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
661 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
662 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
663 uint32_t prod_u32[4];
664 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
665 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
667 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
671 # low 32-bits of unsigned integer multiply
672 binop("umul_low", tuint32
, _2src_commutative
, """
673 uint64_t mask = (1 << (bit_size / 2)) - 1;
674 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
677 # Multiply 32-bits with low 16-bits.
678 binop("imul_32x16", tint32
, "", "src0 * (int16_t) src1")
679 binop("umul_32x16", tuint32
, "", "src0 * (uint16_t) src1")
681 binop("fdiv", tfloat
, "", "src0 / src1")
682 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
683 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
685 # returns a boolean representing the carry resulting from the addition of
686 # the two unsigned arguments.
688 binop_convert("uadd_carry", tuint
, tuint
, _2src_commutative
, "src0 + src1 < src0")
690 # returns a boolean representing the borrow resulting from the subtraction
691 # of the two unsigned arguments.
693 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
695 # hadd: (a + b) >> 1 (without overflow)
696 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
697 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
698 # = 2 * (x & y) + (x & ~y) + (~x & y)
699 # = ((x & y) << 1) + (x ^ y)
701 # Since we know that the bottom bit of (x & y) << 1 is zero,
703 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
704 # = (x & y) + ((x ^ y) >> 1)
705 binop("ihadd", tint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
706 binop("uhadd", tuint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
708 # rhadd: (a + b + 1) >> 1 (without overflow)
709 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
710 # = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
711 # = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
712 # = ((x | y) << 1) - (x ^ y) + 1
714 # Since we know that the bottom bit of (x | y) << 1 is zero,
716 # (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
717 # = (x | y) - ((x ^ y) >> 1)
# Rounding halving add: (a + b + 1) >> 1 without intermediate overflow,
# computed as (x | y) - ((x ^ y) >> 1).  Using '+' here would be wrong:
# for x = 3, y = 0 the result must be 2, but (3 | 0) + (3 >> 1) is 4.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
721 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
723 # For signed integers, there are several different possible definitions of
724 # "modulus" or "remainder". We follow the conventions used by LLVM and
725 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
726 # operation while the imod opcode implements the more mathematical
727 # "modulus" operation. For details on the difference, see
729 # http://mathforum.org/library/drmath/view/52343.html
731 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
732 binop("imod", tint
, "",
733 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
734 " src0 % src1 : src0 % src1 + src1)")
735 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
736 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
743 # these integer-aware comparisons return a boolean (0 or ~0)
745 binop_compare_all_sizes("flt", tfloat
, "", "src0 < src1")
746 binop_compare_all_sizes("fge", tfloat
, "", "src0 >= src1")
747 binop_compare_all_sizes("feq", tfloat
, _2src_commutative
, "src0 == src1")
748 binop_compare_all_sizes("fne", tfloat
, _2src_commutative
, "src0 != src1")
749 binop_compare_all_sizes("ilt", tint
, "", "src0 < src1")
750 binop_compare_all_sizes("ige", tint
, "", "src0 >= src1")
751 binop_compare_all_sizes("ieq", tint
, _2src_commutative
, "src0 == src1")
752 binop_compare_all_sizes("ine", tint
, _2src_commutative
, "src0 != src1")
753 binop_compare_all_sizes("ult", tuint
, "", "src0 < src1")
754 binop_compare_all_sizes("uge", tuint
, "", "src0 >= src1")
756 # integer-aware GLSL-style comparisons that compare floats and ints
758 binop_reduce_all_sizes("ball_fequal", 1, tfloat
, "{src0} == {src1}",
759 "{src0} && {src1}", "{src}")
760 binop_reduce_all_sizes("bany_fnequal", 1, tfloat
, "{src0} != {src1}",
761 "{src0} || {src1}", "{src}")
762 binop_reduce_all_sizes("ball_iequal", 1, tint
, "{src0} == {src1}",
763 "{src0} && {src1}", "{src}")
764 binop_reduce_all_sizes("bany_inequal", 1, tint
, "{src0} != {src1}",
765 "{src0} || {src1}", "{src}")
767 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
769 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
770 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
771 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
772 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
774 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
775 # and false respectively
777 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
778 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
779 binop("seq", tfloat32
, _2src_commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
780 binop("sne", tfloat32
, _2src_commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
782 # SPIRV shifts are undefined for shift-operands >= bitsize,
783 # but SM5 shifts are defined to use the least significant bits, only
784 # The NIR definition is according to the SM5 specification.
785 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
786 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
787 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
788 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
789 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "",
790 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
792 opcode("urol", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
793 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
794 dst = (src0 << (src1 & rotate_mask)) |
795 (src0 >> (-src1 & rotate_mask));
797 opcode("uror", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
798 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
799 dst = (src0 >> (src1 & rotate_mask)) |
800 (src0 << (-src1 & rotate_mask));
803 # bitwise logic operators
805 # These are also used as boolean and, or, xor for hardware supporting
809 binop("iand", tuint
, _2src_commutative
+ associative
, "src0 & src1")
810 binop("ior", tuint
, _2src_commutative
+ associative
, "src0 | src1")
811 binop("ixor", tuint
, _2src_commutative
+ associative
, "src0 ^ src1")
814 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
817 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
818 "{src0} * {src1}", "{src0} + {src1}", "{src}")
820 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
821 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
822 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
823 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
825 binop("fmin", tfloat
, _2src_commutative
+ associative
, "fmin(src0, src1)")
826 binop("imin", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
827 binop("umin", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
828 binop("fmax", tfloat
, _2src_commutative
+ associative
, "fmax(src0, src1)")
829 binop("imax", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
830 binop("umax", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
832 # Saturated vector add for 4 8bit ints.
833 binop("usadd_4x8", tint32
, _2src_commutative
+ associative
, """
835 for (int i = 0; i < 32; i += 8) {
836 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
840 # Saturated vector subtract for 4 8bit ints.
841 binop("ussub_4x8", tint32
, "", """
843 for (int i = 0; i < 32; i += 8) {
844 int src0_chan = (src0 >> i) & 0xff;
845 int src1_chan = (src1 >> i) & 0xff;
846 if (src0_chan > src1_chan)
847 dst |= (src0_chan - src1_chan) << i;
851 # vector min for 4 8bit ints.
852 binop("umin_4x8", tint32
, _2src_commutative
+ associative
, """
854 for (int i = 0; i < 32; i += 8) {
855 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
859 # vector max for 4 8bit ints.
860 binop("umax_4x8", tint32
, _2src_commutative
+ associative
, """
862 for (int i = 0; i < 32; i += 8) {
863 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
867 # unorm multiply: (a * b) / 255.
868 binop("umul_unorm_4x8", tint32
, _2src_commutative
+ associative
, """
870 for (int i = 0; i < 32; i += 8) {
871 int src0_chan = (src0 >> i) & 0xff;
872 int src1_chan = (src1 >> i) & 0xff;
873 dst |= ((src0_chan * src1_chan) / 255) << i;
# Use the double-precision pow() for 64-bit operands and powf() otherwise,
# following the same bit_size pattern as fsqrt.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
879 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
880 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
882 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
883 "src0 | ((uint64_t)src1 << 32)")
885 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
886 "src0 | ((uint32_t)src1 << 16)")
888 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
889 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
890 # are from the low five bits of src0 and src1, respectively.
891 binop_convert("bfm", tuint32
, tint32
, "", """
892 int bits = src0 & 0x1F;
893 int offset = src1 & 0x1F;
894 dst = ((1u << bits) - 1) << offset;
897 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], False, "", """
898 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
899 /* flush denormals to zero. */
901 dst = copysignf(0.0f, src0);
904 # Combines the first component of each input to make a 2-component vector.
906 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
# Byte extraction: shift src0 right by src1 bytes and keep the low 8 bits.
# The unsigned form truncates through uint8_t, the signed form through
# int8_t.
binop("extract_u8", tuint, "",
      "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "",
      "(int8_t)(src0 >> (src1 * 8))")
# 16-bit word extraction: shift src0 right by src1 words and keep the low
# 16 bits.  The unsigned form truncates through uint16_t, the signed form
# through int16_t.
binop("extract_u16", tuint, "",
      "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "",
      "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose sources and result share a type.

    Forwards to opcode() with `ty` as the output type and as the type of
    all three inputs, an output size of 0 and input sizes of 0, and
    is_conversion set to False.

    NOTE(review): a size of 0 presumably means "per-component"; confirm
    against the Opcode documentation.

    - name is the opcode name
    - ty is the type shared by the result and the three sources
    - alg_props is the algebraic-properties string passed through
    - const_expr is the constant-folding expression passed through
    """
    source_sizes = [0, 0, 0]
    source_types = [ty, ty, ty]
    opcode(name, 0, ty, source_sizes, source_types, False, alg_props,
           const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size,
                const_expr):
    """Register a three-source opcode with explicit output and source sizes.

    The result and all three sources are typed tuint.  The opcode is
    registered with is_conversion False and no algebraic properties.

    - name is the opcode name
    - output_size is the size passed as the opcode's output size
    - src1_size, src2_size, src3_size are the sizes of the three sources
    - const_expr is the constant-folding expression passed through
    """
    source_sizes = [src1_size, src2_size, src3_size]
    source_types = [tuint, tuint, tuint]
    opcode(name, output_size, tuint, source_sizes, source_types, False, "",
           const_expr)
927 triop("ffma", tfloat
, _2src_commutative
, """
928 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
930 dst = _mesa_double_fma_rtz(src0, src1, src2);
931 else if (bit_size == 32)
932 dst = _mesa_float_fma_rtz(src0, src1, src2);
934 dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
937 dst = fmaf(src0, src1, src2);
939 dst = fma(src0, src1, src2);
# Linear interpolation between src0 and src1, weighted by src2: the result
# is src0 when src2 is 0 and src1 when src2 is 1.
triop("flrp", tfloat, "",
      "src0 * (1 - src2) + src1 * src2")
# A vector conditional select instruction (like ?:, but operating
# per-component on vectors).  There are two versions: one for
# floating-point bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
#
# fcsel is the floating-point version: src1 is selected when src0 is not
# 0.0f, and src2 otherwise.
triop("fcsel", tfloat32, "",
      "(src0 != 0.0f) ? src1 : src2")
# Three-source minimum, maximum and median.
#
# The float variants use the double-precision fmin()/fmax().  tfloat
# opcodes are also constant-folded at 64 bits (see the bit_size == 64
# checks in fpow and ldexp), and the single-precision fminf()/fmaxf()
# would round 64-bit sources to float before comparing them.  Narrower
# floats widen to double exactly, and min/max only ever select one of
# their operands, so results for the smaller bit sizes are unaffected.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "",
      "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "",
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "",
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
# Boolean-conditioned select, one opcode per boolean width (1, 8, 16 and
# 32 bits).  Each takes a boolean src0 and two tuint values and yields
# src1 when src0 is true, src2 otherwise.
opcode("bcsel", 0, tuint,
       [0, 0, 0], [tbool1, tuint, tuint],
       False, "", "src0 ? src1 : src2")
opcode("b8csel", 0, tuint,
       [0, 0, 0], [tbool8, tuint, tuint],
       False, "", "src0 ? src1 : src2")
opcode("b16csel", 0, tuint,
       [0, 0, 0], [tbool16, tuint, tuint],
       False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint,
       [0, 0, 0], [tbool32, tuint, tuint],
       False, "", "src0 ? src1 : src2")
977 triop("bfi", tuint32
, "", """
978 unsigned mask = src0, insert = src1, base = src2;
987 dst = (base & ~mask) | (insert & mask);
# Bitwise select: each result bit comes from src1 where the corresponding
# bit of the mask src0 is set, and from src2 where it is clear.
triop("bitfield_select", tuint, "",
      "(src0 & src1) | (~src0 & src2)")
994 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
995 opcode("ubfe", 0, tuint32
,
996 [0, 0, 0], [tuint32
, tuint32
, tuint32
], False, "", """
997 unsigned base = src0;
998 unsigned offset = src1 & 0x1F;
999 unsigned bits = src2 & 0x1F;
1002 } else if (offset + bits < 32) {
1003 dst = (base << (32 - bits - offset)) >> (32 - bits);
1005 dst = base >> offset;
1008 opcode("ibfe", 0, tint32
,
1009 [0, 0, 0], [tint32
, tuint32
, tuint32
], False, "", """
1011 unsigned offset = src1 & 0x1F;
1012 unsigned bits = src2 & 0x1F;
1015 } else if (offset + bits < 32) {
1016 dst = (base << (32 - bits - offset)) >> (32 - bits);
1018 dst = base >> offset;
1022 # GLSL bitfieldExtract()
1023 opcode("ubitfield_extract", 0, tuint32
,
1024 [0, 0, 0], [tuint32
, tint32
, tint32
], False, "", """
1025 unsigned base = src0;
1026 int offset = src1, bits = src2;
1029 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
1030 dst = 0; /* undefined per the spec */
1032 dst = (base >> offset) & ((1ull << bits) - 1);
1035 opcode("ibitfield_extract", 0, tint32
,
1036 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
1038 int offset = src1, bits = src2;
1041 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
1044 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
1048 # Combines the first component of each input to make a 3-component vector.
1050 triop_horiz("vec3", 3, 1, 1, 1, """
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source opcode with explicit output and source sizes.

    The result and all four sources are typed tuint.  The opcode is
    registered with is_conversion False and no algebraic properties.

    - name is the opcode name
    - output_size is the size passed as the opcode's output size
    - src1_size .. src4_size are the sizes of the four sources
    - const_expr is the constant-folding expression passed through
    """
    source_sizes = [src1_size, src2_size, src3_size, src4_size]
    source_types = [tuint, tuint, tuint, tuint]
    opcode(name, output_size, tuint, source_sizes, source_types, False, "",
           const_expr)
1063 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
1064 [tuint32
, tuint32
, tint32
, tint32
], False, "", """
1065 unsigned base = src0, insert = src1;
1066 int offset = src2, bits = src3;
1069 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
1072 unsigned mask = ((1ull << bits) - 1) << offset;
1073 dst = (base & ~mask) | ((insert << offset) & mask);
1077 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
1084 opcode("vec8", 8, tuint
,
1085 [1] * 8, [tuint
] * 8,
1097 opcode("vec16", 16, tuint
,
1098 [1] * 16, [tuint
] * 16,
# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower-precision integer multiplies can be used.
# This is useful on hardware that has 24-bit or perhaps 16-bit integer
# multiplies.
binop("amul", tint, _2src_commutative + associative,
      "src0 * src1")
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
1133 opcode("imadsh_mix16", 0, tint32
,
1134 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
1135 dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24-bit multiply into a 32-bit result (with sign extension) plus a 32-bit
# integer: the low 24 bits of src0 and src1 are sign-extended by the
# << 8 / >> 8 pair on int32_t, multiplied, and src2 is added.
triop(
    "imad24_ir3", tint32, _2src_commutative,
    "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2",
)
# Signed 24-bit multiply into a 32-bit result: the low 24 bits of each
# source are sign-extended by the << 8 / >> 8 pair on int32_t before the
# multiply.
binop(
    "imul24", tint32, _2src_commutative + associative,
    "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)",
)
# Unsigned 24-bit multiply into a 32-bit result plus a 32-bit integer: the
# << 8 / >> 8 pair on uint32_t clears the top 8 bits of src0 and src1
# before they are multiplied, then src2 is added.
triop(
    "umad24", tuint32, _2src_commutative,
    "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2",
)
# Unsigned 24-bit multiply into a 32-bit unsigned result: the << 8 / >> 8
# pair on uint32_t clears the top 8 bits of each source before the
# multiply.
#
# The opcode is typed tuint32, matching its unsigned semantics and the
# sibling umad24; it was previously declared with the signed tint32.
binop(
    "umul24", tuint32, _2src_commutative + associative,
    "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)",
)