f18668493b22509bd6267b4b26b7702b5a8086e0
2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, is_conversion
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - is_conversion is true if this opcode represents a type conversion
43 - algebraic_properties is a space-separated string, where nir_op_is_ is
44 prepended before each entry
45 - const_expr is an expression or series of statements that computes the
46 constant value of the opcode given the constant values of its inputs.
48 Constant expressions are formed from the variables src0, src1, ...,
49 src(N-1), where N is the number of arguments. The output of the
50 expression should be stored in the dst variable. Per-component input
51 and output variables will be scalars and non-per-component input and
52 output variables will be a struct with fields named x, y, z, and w
53 all of the correct type. Input and output variables can be assumed
54 to already be of the correct type and need no conversion. In
55 particular, the conversion from the C bool type to/from NIR_TRUE and
56 NIR_FALSE happens automatically.
58 For per-component instructions, the entire expression will be
59 executed once for each component. For non-per-component
60 instructions, the expression is expected to store the correct values
61 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
62 constant expression, an assignment to dst will happen automatically
63 and the result will be equivalent to "dst = <expression>" for
64 per-component instructions and "dst.x = dst.y = ... = <expression>"
65 for non-per-component instructions.
67 assert isinstance(name
, str)
68 assert isinstance(output_size
, int)
69 assert isinstance(output_type
, str)
70 assert isinstance(input_sizes
, list)
71 assert isinstance(input_sizes
[0], int)
72 assert isinstance(input_types
, list)
73 assert isinstance(input_types
[0], str)
74 assert isinstance(is_conversion
, bool)
75 assert isinstance(algebraic_properties
, str)
76 assert isinstance(const_expr
, str)
77 assert len(input_sizes
) == len(input_types
)
78 assert 0 <= output_size
<= 4 or (output_size
== 8) or (output_size
== 16)
79 for size
in input_sizes
:
84 self
.num_inputs
= len(input_sizes
)
85 self
.output_size
= output_size
86 self
.output_type
= output_type
87 self
.input_sizes
= input_sizes
88 self
.input_types
= input_types
89 self
.is_conversion
= is_conversion
90 self
.algebraic_properties
= algebraic_properties
91 self
.const_expr
= const_expr
93 # helper variables for strings
# Splits a NIR type string such as "uint32" into a base type ("uint") and an
# optional decimal bit size ("32").  The pattern is only anchored at the
# start of the string (it is used with .match()).
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def type_has_size(type_):
    """Return True if the NIR type string carries an explicit bit size.

    type_ is a string such as "float" or "float32".  An AssertionError is
    raised when the string does not start with one of the known base types.
    """
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m.group('bits') is not None


def type_size(type_):
    """Return the bit size encoded in a sized NIR type string as an int.

    An AssertionError is raised when the string does not start with a known
    base type, or when it has no bit-size suffix (e.g. plain "float").
    """
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    assert m.group('bits') is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(m.group('bits'))
127 def type_sizes(type_
):
128 if type_has_size(type_
):
129 return [type_size(type_
)]
130 elif type_
== 'bool':
131 return [1, 8, 16, 32]
132 elif type_
== 'float':
135 return [1, 8, 16, 32, 64]
def type_base_type(type_):
    """Return the base type of a NIR type string with any bit size removed.

    The result is one of "int", "uint", "float" or "bool" (the alternatives
    accepted by _TYPE_SPLIT_RE).  An AssertionError is raised when the string
    does not start with one of them.
    """
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m.group('type')
142 # Operation where the first two sources are commutative.
144 # For 2-source operations, this is just mathematical commutativity. Some
145 # 3-source operations, like ffma, are only commutative in the first two
# Algebraic-property tokens.  Each one ends with a space so that several can
# be joined with "+" into the space-separated algebraic_properties string
# that opcode() expects.
_2src_commutative = "2src_commutative "
associative = "associative "
150 # global dictionary of opcodes
153 def opcode(name
, output_size
, output_type
, input_sizes
, input_types
,
154 is_conversion
, algebraic_properties
, const_expr
):
155 assert name
not in opcodes
156 opcodes
[name
] = Opcode(name
, output_size
, output_type
, input_sizes
,
157 input_types
, is_conversion
, algebraic_properties
,
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output type differs from its input.

    Both the output size and the single input size are passed as 0, the
    opcode is not flagged as a conversion, and it has no algebraic
    properties.  const_expr is forwarded to opcode() unchanged.
    """
    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose input and output share type ty.

    Output size and input size are both passed as 0; the opcode is not a
    conversion and has no algebraic properties.
    """
    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
166 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
168 opcode(name
, output_size
, output_type
, [input_size
], [input_type
],
169 False, "", const_expr
)
171 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
172 reduce_expr
, final_expr
):
174 return "(" + prereduce_expr
.format(src
=src
) + ")"
176 return final_expr
.format(src
="(" + src
+ ")")
177 def reduce_(src0
, src1
):
178 return reduce_expr
.format(src0
=src0
, src1
=src1
)
179 src0
= prereduce("src0.x")
180 src1
= prereduce("src0.y")
181 src2
= prereduce("src0.z")
182 src3
= prereduce("src0.w")
183 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
184 final(reduce_(src0
, src1
)))
185 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
186 final(reduce_(reduce_(src0
, src1
), src2
)))
187 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
188 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source numeric conversion opcode.

    Identical to unop_convert() except that is_conversion is passed as True.
    """
    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
193 unop("mov", tuint
, "src0")
195 unop("ineg", tint
, "-src0")
196 unop("fneg", tfloat
, "-src0")
197 unop("inot", tint
, "~src0") # invert every bit of the integer
198 unop("fsign", tfloat
, ("bit_size == 64 ? " +
199 "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
200 "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
201 unop("isign", tint
, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
202 unop("iabs", tint
, "(src0 < 0) ? -src0 : src0")
203 unop("fabs", tfloat
, "fabs(src0)")
204 unop("fsat", tfloat
, ("fmin(fmax(src0, 0.0), 1.0)"))
205 unop("fsat_signed", tfloat
, ("fmin(fmax(src0, -1.0), 1.0)"))
206 unop("fclamp_pos", tfloat
, ("fmax(src0, 0.0)"))
207 unop("frcp", tfloat
, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
208 unop("frsq", tfloat
, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
209 unop("fsqrt", tfloat
, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
210 unop("fexp2", tfloat
, "exp2f(src0)")
211 unop("flog2", tfloat
, "log2f(src0)")
213 # Generate all of the numeric conversion opcodes
214 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
216 dst_types
= [tfloat
, tint
, tbool
]
218 dst_types
= [tfloat
, tint
, tbool
]
220 dst_types
= [tfloat
, tuint
]
221 elif src_t
== tfloat
:
222 dst_types
= [tint
, tuint
, tfloat
, tbool
]
224 for dst_t
in dst_types
:
225 for dst_bit_size
in type_sizes(dst_t
):
226 if dst_bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
227 rnd_modes
= ['_rtne', '_rtz', '']
228 for rnd_mode
in rnd_modes
:
229 if rnd_mode
== '_rtne':
232 dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
237 elif rnd_mode
== '_rtz':
240 dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
248 unop_numeric_convert("{0}2{1}{2}{3}".format(src_t
[0],
252 dst_t
+ str(dst_bit_size
),
254 elif dst_bit_size
== 32 and dst_t
== tfloat
and src_t
== tfloat
:
256 if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
257 dst = _mesa_double_to_float_rtz(src0);
262 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
264 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
266 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
267 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
269 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
271 # Special opcode that is the same as f2f16, i2i16, u2u16 except that it is safe
272 # to remove it if the result is immediately converted back to 32 bits again.
273 # This is generated as part of the precision lowering pass. mp stands for medium
275 unop_numeric_convert("f2fmp", tfloat16
, tfloat
, opcodes
["f2f16"].const_expr
)
276 unop_numeric_convert("i2imp", tint16
, tint
, opcodes
["i2i16"].const_expr
)
277 unop_numeric_convert("u2ump", tuint16
, tuint
, opcodes
["u2u16"].const_expr
)
279 # Unary floating-point rounding operations.
282 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
283 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
284 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
285 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
286 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
288 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
290 # Trigonometric operations.
293 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
294 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
297 unop_convert("frexp_exp", tint32
, tfloat
, "frexp(src0, &dst);")
298 unop_convert("frexp_sig", tfloat
, tfloat
, "int n; dst = frexp(src0, &n);")
300 # Partial derivatives.
303 unop("fddx", tfloat
, "0.0") # the derivative of a constant is 0.
304 unop("fddy", tfloat
, "0.0")
305 unop("fddx_fine", tfloat
, "0.0")
306 unop("fddy_fine", tfloat
, "0.0")
307 unop("fddx_coarse", tfloat
, "0.0")
308 unop("fddy_coarse", tfloat
, "0.0")
311 # Floating point pack and unpack operations.
314 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
315 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
316 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
317 """.replace("fmt", fmt
))
320 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
321 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
322 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
323 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
324 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
325 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one 32-bit uint in, two 32-bit floats out.

    The literal "fmt" inside the C template is replaced by the fmt argument,
    so the generated code calls unpack_<fmt>_1x16().  dst.x is taken from the
    low 16 bits of src0.x and dst.y from the high 16 bits; the high half is
    obtained with a right shift (a left shift by 16 followed by the uint16_t
    cast would always yield 0).
    """
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
334 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
335 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
336 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
337 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
338 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
339 """.replace("fmt", fmt
))
353 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
354 dst.x = (src0.x & 0xffff) | (src0.y << 16);
357 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
358 dst.x = (src0.x << 0) |
364 unop_horiz("pack_32_4x8", 1, tuint32
, 4, tuint8
,
365 "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")
367 unop_horiz("pack_32_2x16", 1, tuint32
, 2, tuint16
,
368 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
370 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
371 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
373 unop_horiz("pack_64_4x16", 1, tuint64
, 4, tuint16
,
374 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
376 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
377 "dst.x = src0.x; dst.y = src0.x >> 32;")
# The source is a single 64-bit component, so all four 16-bit lanes must be
# read from src0.x (src0.w is not part of a 1-component source).
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
382 unop_horiz("unpack_32_2x16", 2, tuint16
, 1, tuint32
,
383 "dst.x = src0.x; dst.y = src0.x >> 16;")
385 unop_horiz("unpack_32_4x8", 4, tuint8
, 1, tuint32
,
386 "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")
388 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32
, 1, tuint32
, """
389 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
390 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
393 # Lowered floating point unpacking operations.
395 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
396 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
397 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
398 "unpack_half_1x16((uint16_t)(src0 >> 16))")
400 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32
, tuint32
,
401 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
402 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32
, tuint32
,
403 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
405 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
406 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
408 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
409 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
411 # Bit operations, part of ARB_gpu_shader5.
414 unop("bitfield_reverse", tuint32
, """
415 /* we're not winning any awards for speed here, but that's ok */
417 for (unsigned bit = 0; bit < 32; bit++)
418 dst |= ((src0 >> bit) & 1) << (31 - bit);
420 unop_convert("bit_count", tuint32
, tuint
, """
422 for (unsigned bit = 0; bit < bit_size; bit++) {
423 if ((src0 >> bit) & 1)
428 unop_convert("ufind_msb", tint32
, tuint
, """
430 for (int bit = bit_size - 1; bit >= 0; bit--) {
431 if ((src0 >> bit) & 1) {
438 unop("uclz", tuint32
, """
440 for (bit = bit_size - 1; bit >= 0; bit--) {
441 if ((src0 & (1u << bit)) != 0)
444 dst = (unsigned)(31 - bit);
447 unop("ifind_msb", tint32
, """
449 for (int bit = 31; bit >= 0; bit--) {
450 /* If src0 < 0, we're looking for the first 0 bit.
451 * if src0 >= 0, we're looking for the first 1 bit.
453 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
454 (!((src0 >> bit) & 1) && (src0 < 0))) {
461 unop_convert("find_lsb", tint32
, tint
, """
463 for (unsigned bit = 0; bit < bit_size; bit++) {
464 if ((src0 >> bit) & 1) {
471 # AMD_gcn_shader extended instructions
472 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
474 float absX = fabsf(src0.x);
475 float absY = fabsf(src0.y);
476 float absZ = fabsf(src0.z);
479 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
480 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
481 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
483 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
484 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
485 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
486 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
487 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
488 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
490 dst.x = dst.x * (1.0f / ma) + 0.5f;
491 dst.y = dst.y * (1.0f / ma) + 0.5f;
494 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
495 float absX = fabsf(src0.x);
496 float absY = fabsf(src0.y);
497 float absZ = fabsf(src0.z);
498 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
499 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
500 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
501 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
502 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
503 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
506 # Sum of vector components
507 unop_reduce("fsum", 1, tfloat
, tfloat
, "{src}", "{src0} + {src1}", "{src}")
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    The output type is out_type; output size and both input sizes are passed
    as 0.  alg_props is the algebraic-properties string and const_expr the
    constant-folding expression, both forwarded to opcode() unchanged.  The
    opcode is not flagged as a conversion.
    """
    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
           False, alg_props, const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type ty."""
    binop_convert(name, ty, ty, alg_props, const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty with a tbool1 result."""
    binop_convert(name, tbool1, ty, alg_props, const_expr)
def binop_compare8(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty with a tbool8 result."""
    binop_convert(name, tbool8, ty, alg_props, const_expr)
def binop_compare16(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty with a tbool16 result."""
    binop_convert(name, tbool16, ty, alg_props, const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty with a tbool32 result."""
    binop_convert(name, tbool32, ty, alg_props, const_expr)
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register one comparison opcode per boolean result width.

    Four opcodes are created from the same source type, properties and
    constant expression: name (tbool1 result), name + "8", name + "16" and
    name + "32" (tbool8, tbool16 and tbool32 results respectively).
    """
    binop_compare(name, ty, alg_props, const_expr)
    binop_compare8(name + "8", ty, alg_props, const_expr)
    binop_compare16(name + "16", ty, alg_props, const_expr)
    binop_compare32(name + "32", ty, alg_props, const_expr)
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes for output and sources.

    Each source has its own size and type.  The opcode is not flagged as a
    conversion and has no algebraic properties.
    """
    opcode(name, out_size, out_type, [src1_size, src2_size],
           [src1_type, src2_type], False, "", const_expr)
539 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
540 reduce_expr
, final_expr
):
542 return final_expr
.format(src
= "(" + src
+ ")")
543 def reduce_(src0
, src1
):
544 return reduce_expr
.format(src0
=src0
, src1
=src1
)
545 def prereduce(src0
, src1
):
546 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
547 src0
= prereduce("src0.x", "src1.x")
548 src1
= prereduce("src0.y", "src1.y")
549 src2
= prereduce("src0.z", "src1.z")
550 src3
= prereduce("src0.w", "src1.w")
551 opcode(name
+ "2", output_size
, output_type
,
552 [2, 2], [src_type
, src_type
], False, _2src_commutative
,
553 final(reduce_(src0
, src1
)))
554 opcode(name
+ "3", output_size
, output_type
,
555 [3, 3], [src_type
, src_type
], False, _2src_commutative
,
556 final(reduce_(reduce_(src0
, src1
), src2
)))
557 opcode(name
+ "4", output_size
, output_type
,
558 [4, 4], [src_type
, src_type
], False, _2src_commutative
,
559 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Call binop_reduce() once per boolean result width.

    The first call uses name unchanged with a tbool1 output.  The remaining
    calls replace the first character of name with "b8", "b16" and "b32" and
    use tbool8, tbool16 and tbool32 outputs.  All other arguments are
    forwarded unchanged.
    """
    binop_reduce(name, output_size, tbool1, src_type,
                 prereduce_expr, reduce_expr, final_expr)
    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
                 prereduce_expr, reduce_expr, final_expr)
    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
                 prereduce_expr, reduce_expr, final_expr)
    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
                 prereduce_expr, reduce_expr, final_expr)
572 binop("fadd", tfloat
, _2src_commutative
+ associative
,"""
573 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
575 dst = _mesa_double_add_rtz(src0, src1);
577 dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
582 binop("iadd", tint
, _2src_commutative
+ associative
, "src0 + src1")
583 binop("iadd_sat", tint
, _2src_commutative
, """
585 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
586 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
588 binop("uadd_sat", tuint
, _2src_commutative
,
589 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
590 binop("isub_sat", tint
, "", """
592 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
593 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
595 binop("usub_sat", tuint
, "", "src0 < src1 ? 0 : src0 - src1")
597 binop("fsub", tfloat
, "", """
598 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
600 dst = _mesa_double_sub_rtz(src0, src1);
602 dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
607 binop("isub", tint
, "", "src0 - src1")
608 binop_convert("uabs_isub", tuint
, tint
, "", """
609 src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
610 : (uint64_t) src0 - (uint64_t) src1
612 binop("uabs_usub", tuint
, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
614 binop("fmul", tfloat
, _2src_commutative
+ associative
, """
615 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
617 dst = _mesa_double_mul_rtz(src0, src1);
619 dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
624 # low 32-bits of signed/unsigned integer multiply
625 binop("imul", tint
, _2src_commutative
+ associative
, "src0 * src1")
627 # Generate a 64-bit result from two 32-bit quantities
628 binop_convert("imul_2x32_64", tint64
, tint32
, _2src_commutative
,
629 "(int64_t)src0 * (int64_t)src1")
630 binop_convert("umul_2x32_64", tuint64
, tuint32
, _2src_commutative
,
631 "(uint64_t)src0 * (uint64_t)src1")
633 # high 32-bits of signed integer multiply
634 binop("imul_high", tint
, _2src_commutative
, """
635 if (bit_size == 64) {
636 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
637 * extension to work properly. The casts are kind-of annoying but needed
638 * to prevent compiler warnings.
640 uint32_t src0_u32[4] = {
646 uint32_t src1_u32[4] = {
652 uint32_t prod_u32[4];
653 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
654 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
656 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
660 # high 32-bits of unsigned integer multiply
661 binop("umul_high", tuint
, _2src_commutative
, """
662 if (bit_size == 64) {
663 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
664 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
665 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
666 uint32_t prod_u32[4];
667 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
668 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
670 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
674 # low 32-bits of unsigned integer multiply
675 binop("umul_low", tuint32
, _2src_commutative
, """
676 uint64_t mask = (1 << (bit_size / 2)) - 1;
677 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
680 # Multiply 32-bits with low 16-bits.
681 binop("imul_32x16", tint32
, "", "src0 * (int16_t) src1")
682 binop("umul_32x16", tuint32
, "", "src0 * (uint16_t) src1")
684 binop("fdiv", tfloat
, "", "src0 / src1")
685 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
686 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
688 # returns a boolean representing the carry resulting from the addition of
689 # the two unsigned arguments.
691 binop_convert("uadd_carry", tuint
, tuint
, _2src_commutative
, "src0 + src1 < src0")
693 # returns a boolean representing the borrow resulting from the subtraction
694 # of the two unsigned arguments.
696 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
698 # hadd: (a + b) >> 1 (without overflow)
699 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
700 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
701 # = 2 * (x & y) + (x & ~y) + (~x & y)
702 # = ((x & y) << 1) + (x ^ y)
704 # Since we know that the bottom bit of (x & y) << 1 is zero,
706 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
707 # = (x & y) + ((x ^ y) >> 1)
708 binop("ihadd", tint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
709 binop("uhadd", tuint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
711 # rhadd: (a + b + 1) >> 1 (without overflow)
712 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
713 # = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
714 # = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
715 # = ((x | y) << 1) - (x ^ y) + 1
717 # Since we know that the bottom bit of (x | y) << 1 is zero,
719 # (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
720 # = (x | y) - ((x ^ y) >> 1)
# Rounding half-add, (a + b + 1) >> 1 without intermediate overflow:
# (a | b) - ((a ^ b) >> 1).  The xor term is subtracted; for example a = 3,
# b = 0 gives 3 - 1 = 2, which equals (3 + 0 + 1) >> 1.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
724 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
726 # For signed integers, there are several different possible definitions of
727 # "modulus" or "remainder". We follow the conventions used by LLVM and
728 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
729 # operation while the imod opcode implements the more mathematical
730 # "modulus" operation. For details on the difference, see
732 # http://mathforum.org/library/drmath/view/52343.html
734 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
735 binop("imod", tint
, "",
736 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
737 " src0 % src1 : src0 % src1 + src1)")
738 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
739 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
746 # these integer-aware comparisons return a boolean (0 or ~0)
748 binop_compare_all_sizes("flt", tfloat
, "", "src0 < src1")
749 binop_compare_all_sizes("fge", tfloat
, "", "src0 >= src1")
750 binop_compare_all_sizes("feq", tfloat
, _2src_commutative
, "src0 == src1")
751 binop_compare_all_sizes("fne", tfloat
, _2src_commutative
, "src0 != src1")
752 binop_compare_all_sizes("ilt", tint
, "", "src0 < src1")
753 binop_compare_all_sizes("ige", tint
, "", "src0 >= src1")
754 binop_compare_all_sizes("ieq", tint
, _2src_commutative
, "src0 == src1")
755 binop_compare_all_sizes("ine", tint
, _2src_commutative
, "src0 != src1")
756 binop_compare_all_sizes("ult", tuint
, "", "src0 < src1")
757 binop_compare_all_sizes("uge", tuint
, "", "src0 >= src1")
759 # integer-aware GLSL-style comparisons that compare floats and ints
761 binop_reduce_all_sizes("ball_fequal", 1, tfloat
, "{src0} == {src1}",
762 "{src0} && {src1}", "{src}")
763 binop_reduce_all_sizes("bany_fnequal", 1, tfloat
, "{src0} != {src1}",
764 "{src0} || {src1}", "{src}")
765 binop_reduce_all_sizes("ball_iequal", 1, tint
, "{src0} == {src1}",
766 "{src0} && {src1}", "{src}")
767 binop_reduce_all_sizes("bany_inequal", 1, tint
, "{src0} != {src1}",
768 "{src0} || {src1}", "{src}")
770 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
772 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
773 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
774 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
775 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
777 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
778 # and false respectively
780 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
781 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
782 binop("seq", tfloat32
, _2src_commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
783 binop("sne", tfloat32
, _2src_commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
785 # SPIRV shifts are undefined for shift-operands >= bitsize,
786 # but SM5 shifts are defined to use the least significant bits, only
787 # The NIR definition is according to the SM5 specification.
788 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
789 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
790 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
791 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
792 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "",
793 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
795 opcode("urol", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
796 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
797 dst = (src0 << (src1 & rotate_mask)) |
798 (src0 >> (-src1 & rotate_mask));
800 opcode("uror", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
801 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
802 dst = (src0 >> (src1 & rotate_mask)) |
803 (src0 << (-src1 & rotate_mask));
806 # bitwise logic operators
808 # These are also used as boolean and, or, xor for hardware supporting
812 binop("iand", tuint
, _2src_commutative
+ associative
, "src0 & src1")
813 binop("ior", tuint
, _2src_commutative
+ associative
, "src0 | src1")
814 binop("ixor", tuint
, _2src_commutative
+ associative
, "src0 ^ src1")
817 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
820 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
821 "{src0} * {src1}", "{src0} + {src1}", "{src}")
823 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
824 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
825 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
826 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
828 binop("fmin", tfloat
, _2src_commutative
+ associative
, "fmin(src0, src1)")
829 binop("imin", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
830 binop("umin", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
831 binop("fmax", tfloat
, _2src_commutative
+ associative
, "fmax(src0, src1)")
832 binop("imax", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
833 binop("umax", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
835 # Saturated vector add for 4 8bit ints.
836 binop("usadd_4x8", tint32
, _2src_commutative
+ associative
, """
838 for (int i = 0; i < 32; i += 8) {
839 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
843 # Saturated vector subtract for 4 8bit ints.
844 binop("ussub_4x8", tint32
, "", """
846 for (int i = 0; i < 32; i += 8) {
847 int src0_chan = (src0 >> i) & 0xff;
848 int src1_chan = (src1 >> i) & 0xff;
849 if (src0_chan > src1_chan)
850 dst |= (src0_chan - src1_chan) << i;
854 # vector min for 4 8bit ints.
855 binop("umin_4x8", tint32
, _2src_commutative
+ associative
, """
857 for (int i = 0; i < 32; i += 8) {
858 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
862 # vector max for 4 8bit ints.
863 binop("umax_4x8", tint32
, _2src_commutative
+ associative
, """
865 for (int i = 0; i < 32; i += 8) {
866 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
870 # unorm multiply: (a * b) / 255.
871 binop("umul_unorm_4x8", tint32
, _2src_commutative
+ associative
, """
873 for (int i = 0; i < 32; i += 8) {
874 int src0_chan = (src0 >> i) & 0xff;
875 int src1_chan = (src1 >> i) & 0xff;
876 dst |= ((src0_chan * src1_chan) / 255) << i;
# Use the double-precision pow() for 64-bit operands and powf() otherwise,
# the same selection the other bit_size-dependent math opcodes make.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
882 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
883 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
885 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
886 "src0 | ((uint64_t)src1 << 32)")
888 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
889 "src0 | ((uint32_t)src1 << 16)")
891 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
892 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
893 # are from the low five bits of src0 and src1, respectively.
894 binop_convert("bfm", tuint32
, tint32
, "", """
895 int bits = src0 & 0x1F;
896 int offset = src1 & 0x1F;
897 dst = ((1u << bits) - 1) << offset;
900 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], False, "", """
901 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
902 /* flush denormals to zero. */
904 dst = copysignf(0.0f, src0);
907 # Combines the first component of each input to make a 2-component vector.
909 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
# Byte extraction: shift src0 right by src1 whole bytes and truncate the
# result to 8 bits, through an unsigned (u8) or signed (i8) cast.
binop("extract_u8", tuint, "",
      "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "",
      "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: same idea with 16-bit units.
binop("extract_u16", tuint, "",
      "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "",
      "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose result and sources share a type.

    - name is the opcode name
    - ty is used both as the output type and as the type of all three inputs
    - alg_props is the algebraic-properties string forwarded to opcode()
    - const_expr is the constant-folding expression forwarded to opcode()

    The output size and every input size are passed as 0, and the opcode is
    registered as a non-conversion.
    """
    num_srcs = 3
    opcode(name, 0, ty, [0] * num_srcs, [ty] * num_srcs, False,
           alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size,
                const_expr):
    """Register a three-source opcode with explicit output and source sizes.

    - name is the opcode name
    - output_size is forwarded as the output size
    - src1_size, src2_size and src3_size become the input size list, in order
    - const_expr is the constant-folding expression forwarded to opcode()

    The output and all three inputs are typed tuint, no algebraic properties
    are set, and the opcode is registered as a non-conversion.
    """
    src_sizes = [src1_size, src2_size, src3_size]
    src_types = [tuint] * len(src_sizes)
    opcode(name, output_size, tuint, src_sizes, src_types, False, "",
           const_expr)
930 triop("ffma", tfloat
, _2src_commutative
, """
931 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
933 dst = _mesa_double_fma_rtz(src0, src1, src2);
934 else if (bit_size == 32)
935 dst = _mesa_float_fma_rtz(src0, src1, src2);
937 dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
940 dst = fmaf(src0, src1, src2);
942 dst = fma(src0, src1, src2);
# Linear interpolation: blends src0 and src1 using src2 as the weight
# (src2 == 0 yields src0, src2 == 1 yields src1).
triop("flrp", tfloat, "",
      "src0 * (1 - src2) + src1 * src2")
950 # A vector conditional select instruction (like ?:, but operating per-
951 # component on vectors). There are two versions, one for floating point
952 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
# Select driven by a 32-bit float condition: src1 when src0 is non-zero,
# src2 otherwise.
triop("fcsel", tfloat32, "",
      "(src0 != 0.0f) ? src1 : src2")
# Three-source minimum, maximum and median.
#
# The float variants fold with the double-precision fmin()/fmax(), like the
# two-source fmax opcode does.  The single-precision fminf()/fmaxf() would
# narrow 64-bit operands to float.  For narrower operands the result is
# always one of the inputs, so evaluating in double changes nothing.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "",
      "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "",
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "",
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
# Boolean-conditioned selects: the result is src1 when src0 is true and
# src2 otherwise.  There is one opcode per boolean type (tbool1, tbool8,
# tbool16, tbool32); only the type of the condition source differs.
opcode("bcsel", 0, tuint,
       [0, 0, 0], [tbool1, tuint, tuint],
       False, "", "src0 ? src1 : src2")
opcode("b8csel", 0, tuint,
       [0, 0, 0], [tbool8, tuint, tuint],
       False, "", "src0 ? src1 : src2")
opcode("b16csel", 0, tuint,
       [0, 0, 0], [tbool16, tuint, tuint],
       False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint,
       [0, 0, 0], [tbool32, tuint, tuint],
       False, "", "src0 ? src1 : src2")
980 triop("bfi", tuint32
, "", """
981 unsigned mask = src0, insert = src1, base = src2;
990 dst = (base & ~mask) | (insert & mask);
# Per-bit select with src0 as the mask: a set mask bit takes the bit from
# src1, a clear mask bit takes it from src2.
triop("bitfield_select", tuint, "",
      "(src0 & src1) | (~src0 & src2)")
997 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
998 opcode("ubfe", 0, tuint32
,
999 [0, 0, 0], [tuint32
, tuint32
, tuint32
], False, "", """
1000 unsigned base = src0;
1001 unsigned offset = src1 & 0x1F;
1002 unsigned bits = src2 & 0x1F;
1005 } else if (offset + bits < 32) {
1006 dst = (base << (32 - bits - offset)) >> (32 - bits);
1008 dst = base >> offset;
1011 opcode("ibfe", 0, tint32
,
1012 [0, 0, 0], [tint32
, tuint32
, tuint32
], False, "", """
1014 unsigned offset = src1 & 0x1F;
1015 unsigned bits = src2 & 0x1F;
1018 } else if (offset + bits < 32) {
1019 dst = (base << (32 - bits - offset)) >> (32 - bits);
1021 dst = base >> offset;
1025 # GLSL bitfieldExtract()
1026 opcode("ubitfield_extract", 0, tuint32
,
1027 [0, 0, 0], [tuint32
, tint32
, tint32
], False, "", """
1028 unsigned base = src0;
1029 int offset = src1, bits = src2;
1032 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
1033 dst = 0; /* undefined per the spec */
1035 dst = (base >> offset) & ((1ull << bits) - 1);
1038 opcode("ibitfield_extract", 0, tint32
,
1039 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
1041 int offset = src1, bits = src2;
1044 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
1047 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
1051 # Combines the first component of each input to make a 3-component vector.
1053 triop_horiz("vec3", 3, 1, 1, 1, """
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source opcode with explicit output and source sizes.

    - name is the opcode name
    - output_size is forwarded as the output size
    - src1_size through src4_size become the input size list, in order
    - const_expr is the constant-folding expression forwarded to opcode()

    The output and all four inputs are typed tuint, no algebraic properties
    are set, and the opcode is registered as a non-conversion.
    """
    src_sizes = [src1_size, src2_size, src3_size, src4_size]
    src_types = [tuint] * len(src_sizes)
    opcode(name, output_size, tuint, src_sizes, src_types, False, "",
           const_expr)
1066 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
1067 [tuint32
, tuint32
, tint32
, tint32
], False, "", """
1068 unsigned base = src0, insert = src1;
1069 int offset = src2, bits = src3;
1072 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
1075 unsigned mask = ((1ull << bits) - 1) << offset;
1076 dst = (base & ~mask) | ((insert << offset) & mask);
1080 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
1087 opcode("vec8", 8, tuint
,
1088 [1] * 8, [tuint
] * 8,
1100 opcode("vec16", 16, tuint
,
1101 [1] * 16, [tuint
] * 16,
1121 # An integer multiply instruction for address calculation. This is
1122 # similar to imul, except that the results are undefined in case of
1123 # overflow. Overflow is defined according to the size of the variable
1124 # being dereferenced.
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower-precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
# Constant folding for amul is a plain product of the two sources.  It is
# registered as commutative and associative.
binop("amul", tint, _2src_commutative + associative,
      "src0 * src1")
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
1136 opcode("imadsh_mix16", 0, tint32
,
1137 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
1138 dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
# Signed 24-bit multiplies.  In both expressions "(x << 8) >> 8" on an
# int32_t keeps the low 24 bits of the source and sign-extends them to
# 32 bits before the multiply.

# ir3-specific instruction that maps directly to ir3 mad.s24:
# 24b multiply into 32b result (with sign extension) plus 32b int (src2).
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension), no addend.
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")
# Unsigned 24-bit multiplies.  "(x << 8) >> 8" on a uint32_t keeps the low
# 24 bits of the source and clears the top byte before the multiply.

# Unsigned 24b multiply into 32b result plus 32b int (src2).
triop("umad24", tuint32, _2src_commutative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")

# Unsigned 24b multiply into 32b result, no addend.
# NOTE(review): this is declared with tint32 although the expression is
# unsigned and the sibling umad24 uses tuint32 -- confirm whether tuint32
# was intended.
binop("umul24", tint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")