2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, is_conversion
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - is_conversion is true if this opcode represents a type conversion
- algebraic_properties is a space-separated string, where nir_op_is_ is
44 prepended before each entry
45 - const_expr is an expression or series of statements that computes the
46 constant value of the opcode given the constant values of its inputs.
48 Constant expressions are formed from the variables src0, src1, ...,
49 src(N-1), where N is the number of arguments. The output of the
50 expression should be stored in the dst variable. Per-component input
51 and output variables will be scalars and non-per-component input and
52 output variables will be a struct with fields named x, y, z, and w
53 all of the correct type. Input and output variables can be assumed
54 to already be of the correct type and need no conversion. In
55 particular, the conversion from the C bool type to/from NIR_TRUE and
56 NIR_FALSE happens automatically.
58 For per-component instructions, the entire expression will be
59 executed once for each component. For non-per-component
60 instructions, the expression is expected to store the correct values
61 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
62 constant expression, an assignment to dst will happen automatically
63 and the result will be equivalent to "dst = <expression>" for
64 per-component instructions and "dst.x = dst.y = ... = <expression>"
65 for non-per-component instructions.
67 assert isinstance(name
, str)
68 assert isinstance(output_size
, int)
69 assert isinstance(output_type
, str)
70 assert isinstance(input_sizes
, list)
71 assert isinstance(input_sizes
[0], int)
72 assert isinstance(input_types
, list)
73 assert isinstance(input_types
[0], str)
74 assert isinstance(is_conversion
, bool)
75 assert isinstance(algebraic_properties
, str)
76 assert isinstance(const_expr
, str)
77 assert len(input_sizes
) == len(input_types
)
78 assert 0 <= output_size
<= 4
79 for size
in input_sizes
:
84 self
.num_inputs
= len(input_sizes
)
85 self
.output_size
= output_size
86 self
.output_type
= output_type
87 self
.input_sizes
= input_sizes
88 self
.input_types
= input_types
89 self
.is_conversion
= is_conversion
90 self
.algebraic_properties
= algebraic_properties
91 self
.const_expr
= const_expr
93 # helper variables for strings
# Splits a NIR type string into a base type ("int", "uint", "float" or
# "bool") captured as group 'type', and an optional trailing decimal bit
# size captured as group 'bits' (None when the type is unsized).
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
def type_has_size(type_):
    """Tell whether a NIR type string carries an explicit bit size.

    type_ is parsed with _TYPE_SPLIT_RE; an assertion fires if it does not
    start with a recognised base type.  Returns True when a numeric suffix
    follows the base type and False otherwise.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
def type_size(type_):
    """Return the bit size encoded in a sized NIR type string as an int.

    Asserts that type_ parses with _TYPE_SPLIT_RE and that it actually has
    a numeric bit-size suffix; unsized types are rejected.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
122 def type_sizes(type_
):
123 if type_has_size(type_
):
124 return [type_size(type_
)]
125 elif type_
== 'bool':
127 elif type_
== 'float':
130 return [1, 8, 16, 32, 64]
def type_base_type(type_):
    """Return the base type name of a NIR type string, without any bit size.

    Asserts that type_ parses with _TYPE_SPLIT_RE, then returns the text
    captured by the regex's 'type' group.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
137 # Operation where the first two sources are commutative.
# For 2-source operations, this is just mathematical commutativity. Some
140 # 3-source operations, like ffma, are only commutative in the first two
_2src_commutative = "2src_commutative "
# Each property token ends with a space so that several tokens can be
# concatenated with "+" (e.g. _2src_commutative + associative) and still
# form the space-separated algebraic_properties string.
associative = "associative "
145 # global dictionary of opcodes
148 def opcode(name
, output_size
, output_type
, input_sizes
, input_types
,
149 is_conversion
, algebraic_properties
, const_expr
):
150 assert name
not in opcodes
151 opcodes
[name
] = Opcode(name
, output_size
, output_type
, input_sizes
,
152 input_types
, is_conversion
, algebraic_properties
,
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output type differs from its input.

    Output and input sizes are both passed as 0, the opcode is not flagged
    as a conversion, and it has no algebraic properties.
    NOTE(review): size 0 presumably means "per-component" -- confirm against
    the Opcode documentation.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0],
           input_types=[in_type],
           is_conversion=False,
           algebraic_properties="",
           const_expr=const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type ty.

    This is unop_convert with identical input and output types, mirroring
    how binop is expressed in terms of binop_convert.
    """
    unop_convert(name, ty, ty, const_expr)
161 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
163 opcode(name
, output_size
, output_type
, [input_size
], [input_type
],
164 False, "", const_expr
)
166 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
167 reduce_expr
, final_expr
):
169 return "(" + prereduce_expr
.format(src
=src
) + ")"
171 return final_expr
.format(src
="(" + src
+ ")")
172 def reduce_(src0
, src1
):
173 return reduce_expr
.format(src0
=src0
, src1
=src1
)
174 src0
= prereduce("src0.x")
175 src1
= prereduce("src0.y")
176 src2
= prereduce("src0.z")
177 src3
= prereduce("src0.w")
178 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
179 final(reduce_(src0
, src1
)))
180 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
181 final(reduce_(reduce_(src0
, src1
), src2
)))
182 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
183 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion.

    Same shape as unop_convert (sizes 0, no algebraic properties), except
    that is_conversion is passed as True.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0],
           input_types=[in_type],
           is_conversion=True,
           algebraic_properties="",
           const_expr=const_expr)
188 unop("mov", tuint
, "src0")
190 unop("ineg", tint
, "-src0")
191 unop("fneg", tfloat
, "-src0")
192 unop("inot", tint
, "~src0") # invert every bit of the integer
193 unop("fsign", tfloat
, ("bit_size == 64 ? " +
194 "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
195 "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
196 unop("isign", tint
, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
197 unop("iabs", tint
, "(src0 < 0) ? -src0 : src0")
198 unop("fabs", tfloat
, "fabs(src0)")
199 unop("fsat", tfloat
, ("bit_size == 64 ? " +
200 "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
201 "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
202 unop("frcp", tfloat
, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
203 unop("frsq", tfloat
, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
204 unop("fsqrt", tfloat
, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
205 unop("fexp2", tfloat
, "exp2f(src0)")
206 unop("flog2", tfloat
, "log2f(src0)")
208 # Generate all of the numeric conversion opcodes
209 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
211 dst_types
= [tfloat
, tint
]
213 dst_types
= [tfloat
, tint
, tbool
]
215 dst_types
= [tfloat
, tuint
]
216 elif src_t
== tfloat
:
217 dst_types
= [tint
, tuint
, tfloat
, tbool
]
219 for dst_t
in dst_types
:
220 for bit_size
in type_sizes(dst_t
):
221 if bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
222 rnd_modes
= ['_rtne', '_rtz', '']
223 for rnd_mode
in rnd_modes
:
224 unop_numeric_convert("{0}2{1}{2}{3}".format(src_t
[0], dst_t
[0],
226 dst_t
+ str(bit_size
), src_t
, "src0")
227 elif bit_size
== 32 and dst_t
== tfloat
and src_t
== tfloat
:
229 if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
230 dst = _mesa_double_to_float_rtz(src0);
235 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0], bit_size
),
236 dst_t
+ str(bit_size
), src_t
, conv_expr
)
238 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
239 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0], bit_size
),
240 dst_t
+ str(bit_size
), src_t
, conv_expr
)
243 # Unary floating-point rounding operations.
246 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
247 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
248 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
249 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
250 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
252 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
254 # Trigonometric operations.
257 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
258 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
261 unop_convert("frexp_exp", tint32
, tfloat
, "frexp(src0, &dst);")
262 unop_convert("frexp_sig", tfloat
, tfloat
, "int n; dst = frexp(src0, &n);")
264 # Partial derivatives.
267 unop("fddx", tfloat
, "0.0") # the derivative of a constant is 0.
268 unop("fddy", tfloat
, "0.0")
269 unop("fddx_fine", tfloat
, "0.0")
270 unop("fddy_fine", tfloat
, "0.0")
271 unop("fddx_coarse", tfloat
, "0.0")
272 unop("fddy_coarse", tfloat
, "0.0")
275 # Floating point pack and unpack operations.
278 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
279 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
280 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
281 """.replace("fmt", fmt
))
284 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
285 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
286 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
287 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
288 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
289 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the "unpack_<fmt>_2x16" opcode.

    The opcode takes one 32-bit unsigned source and produces two 32-bit
    floats: dst.x is unpacked from the low 16 bits of src0.x and dst.y from
    the high 16 bits.  Every occurrence of "fmt" in the constant expression
    is replaced by the fmt argument, so unpack_fmt_1x16 becomes
    unpack_<fmt>_1x16.
    """
    # The high half must be shifted *down* before the uint16_t cast; shifting
    # left by 16 would leave the low 16 bits zero and always unpack 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
298 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
299 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
300 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
301 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
302 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
303 """.replace("fmt", fmt
))
317 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
318 dst.x = (src0.x & 0xffff) | (src0.y << 16);
321 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
322 dst.x = (src0.x << 0) |
328 unop_horiz("pack_32_2x16", 1, tuint32
, 2, tuint16
,
329 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
331 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
332 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
334 unop_horiz("pack_64_4x16", 1, tuint64
, 4, tuint16
,
335 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
337 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
338 "dst.x = src0.x; dst.y = src0.x >> 32;")
# Split one 64-bit value into four 16-bit words, lowest word first.  The
# source has a single component, so every output must be taken from src0.x
# (src0.w is not a valid component of a 1-component input).
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
343 unop_horiz("unpack_32_2x16", 2, tuint16
, 1, tuint32
,
344 "dst.x = src0.x; dst.y = src0.x >> 16;")
346 # Lowered floating point unpacking operations.
349 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
350 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
351 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
352 "unpack_half_1x16((uint16_t)(src0 >> 16))")
354 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
355 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
357 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
358 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
360 # Bit operations, part of ARB_gpu_shader5.
363 unop("bitfield_reverse", tuint32
, """
364 /* we're not winning any awards for speed here, but that's ok */
366 for (unsigned bit = 0; bit < 32; bit++)
367 dst |= ((src0 >> bit) & 1) << (31 - bit);
369 unop_convert("bit_count", tuint32
, tuint
, """
371 for (unsigned bit = 0; bit < bit_size; bit++) {
372 if ((src0 >> bit) & 1)
377 unop_convert("ufind_msb", tint32
, tuint
, """
379 for (int bit = bit_size - 1; bit >= 0; bit--) {
380 if ((src0 >> bit) & 1) {
387 unop("ifind_msb", tint32
, """
389 for (int bit = 31; bit >= 0; bit--) {
390 /* If src0 < 0, we're looking for the first 0 bit.
391 * if src0 >= 0, we're looking for the first 1 bit.
393 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
394 (!((src0 >> bit) & 1) && (src0 < 0))) {
401 unop_convert("find_lsb", tint32
, tint
, """
403 for (unsigned bit = 0; bit < bit_size; bit++) {
404 if ((src0 >> bit) & 1) {
412 for i
in range(1, 5):
413 for j
in range(1, 5):
414 unop_horiz("fnoise{0}_{1}".format(i
, j
), i
, tfloat
, j
, tfloat
, "0.0f")
417 # AMD_gcn_shader extended instructions
418 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
420 float absX = fabs(src0.x);
421 float absY = fabs(src0.y);
422 float absZ = fabs(src0.z);
425 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
426 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
427 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
429 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
430 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
431 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
432 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
433 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
434 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
436 dst.x = dst.x / ma + 0.5;
437 dst.y = dst.y / ma + 0.5;
440 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
441 float absX = fabs(src0.x);
442 float absY = fabs(src0.y);
443 float absZ = fabs(src0.z);
444 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
445 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
446 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
447 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
448 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
449 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
452 # Sum of vector components
453 unop_reduce("fsum", 1, tfloat
, tfloat
, "{src}", "{src0} + {src1}", "{src}")
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    The result has type out_type; output and both input sizes are passed as
    0, the opcode is not flagged as a conversion, and alg_props is forwarded
    as its algebraic-properties string.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0, 0],
           input_types=[in_type, in_type],
           is_conversion=False,
           algebraic_properties=alg_props,
           const_expr=const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type ty."""
    binop_convert(name,
                  out_type=ty,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty that yields a tbool1."""
    binop_convert(name,
                  out_type=tbool1,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty that yields a tbool32."""
    binop_convert(name,
                  out_type=tbool32,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types per operand.

    Unlike binop, the output and each source get their own size and type.
    The opcode is not flagged as a conversion and has no algebraic
    properties.
    """
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, False, "", const_expr)
473 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
474 reduce_expr
, final_expr
):
476 return final_expr
.format(src
= "(" + src
+ ")")
477 def reduce_(src0
, src1
):
478 return reduce_expr
.format(src0
=src0
, src1
=src1
)
479 def prereduce(src0
, src1
):
480 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
481 src0
= prereduce("src0.x", "src1.x")
482 src1
= prereduce("src0.y", "src1.y")
483 src2
= prereduce("src0.z", "src1.z")
484 src3
= prereduce("src0.w", "src1.w")
485 opcode(name
+ "2", output_size
, output_type
,
486 [2, 2], [src_type
, src_type
], False, _2src_commutative
,
487 final(reduce_(src0
, src1
)))
488 opcode(name
+ "3", output_size
, output_type
,
489 [3, 3], [src_type
, src_type
], False, _2src_commutative
,
490 final(reduce_(reduce_(src0
, src1
), src2
)))
491 opcode(name
+ "4", output_size
, output_type
,
492 [4, 4], [src_type
, src_type
], False, _2src_commutative
,
493 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
495 binop("fadd", tfloat
, _2src_commutative
+ associative
, "src0 + src1")
496 binop("iadd", tint
, _2src_commutative
+ associative
, "src0 + src1")
497 binop("iadd_sat", tint
, _2src_commutative
, """
499 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
500 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
502 binop("uadd_sat", tuint
, _2src_commutative
,
503 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
504 binop("isub_sat", tint
, "", """
506 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
507 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
509 binop("usub_sat", tuint
, "", "src0 < src1 ? 0 : src0 - src1")
511 binop("fsub", tfloat
, "", "src0 - src1")
512 binop("isub", tint
, "", "src0 - src1")
514 binop("fmul", tfloat
, _2src_commutative
+ associative
, "src0 * src1")
515 # low 32-bits of signed/unsigned integer multiply
516 binop("imul", tint
, _2src_commutative
+ associative
, "src0 * src1")
518 # Generate 64 bit result from 2 32 bits quantity
519 binop_convert("imul_2x32_64", tint64
, tint32
, _2src_commutative
,
520 "(int64_t)src0 * (int64_t)src1")
521 binop_convert("umul_2x32_64", tuint64
, tuint32
, _2src_commutative
,
522 "(uint64_t)src0 * (uint64_t)src1")
524 # high 32-bits of signed integer multiply
525 binop("imul_high", tint
, _2src_commutative
, """
526 if (bit_size == 64) {
527 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
528 * extension to work properly. The casts are kind-of annoying but needed
529 * to prevent compiler warnings.
531 uint32_t src0_u32[4] = {
537 uint32_t src1_u32[4] = {
543 uint32_t prod_u32[4];
544 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
545 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
547 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
551 # high 32-bits of unsigned integer multiply
552 binop("umul_high", tuint
, _2src_commutative
, """
553 if (bit_size == 64) {
554 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
555 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
556 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
557 uint32_t prod_u32[4];
558 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
559 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
561 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
565 # low 32-bits of unsigned integer multiply
566 binop("umul_low", tuint32
, _2src_commutative
, """
567 uint64_t mask = (1 << (bit_size / 2)) - 1;
568 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
572 binop("fdiv", tfloat
, "", "src0 / src1")
573 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
574 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
576 # returns a boolean representing the carry resulting from the addition of
577 # the two unsigned arguments.
579 binop_convert("uadd_carry", tuint
, tuint
, _2src_commutative
, "src0 + src1 < src0")
581 # returns a boolean representing the borrow resulting from the subtraction
582 # of the two unsigned arguments.
584 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
586 # hadd: (a + b) >> 1 (without overflow)
587 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
588 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
589 # = 2 * (x & y) + (x & ~y) + (~x & y)
590 # = ((x & y) << 1) + (x ^ y)
592 # Since we know that the bottom bit of (x & y) << 1 is zero,
594 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
595 # = (x & y) + ((x ^ y) >> 1)
596 binop("ihadd", tint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
597 binop("uhadd", tuint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
599 # rhadd: (a + b + 1) >> 1 (without overflow)
600 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
601 # = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
602 # = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
603 # = ((x | y) << 1) - (x ^ y) + 1
# Since we know that the bottom bit of (x | y) << 1 is zero,
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
608 # = (x | y) - ((x ^ y) >> 1)
# Rounding halving add, (a + b + 1) >> 1 without intermediate overflow.
# Because a + b + 1 == ((a | b) << 1) - (a ^ b) + 1, the result is
# (a | b) - ((a ^ b) >> 1): the xor term is subtracted, not added.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
612 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
614 # For signed integers, there are several different possible definitions of
615 # "modulus" or "remainder". We follow the conventions used by LLVM and
616 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
617 # operation while the imod opcode implements the more mathematical
618 # "modulus" operation. For details on the difference, see
620 # http://mathforum.org/library/drmath/view/52343.html
622 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
623 binop("imod", tint
, "",
624 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
625 " src0 % src1 : src0 % src1 + src1)")
# Floating-point modulus (floor-based) and remainder (truncation-based).
# The rounding helper is chosen by bit_size, as ffloor/ftrunc/ffract do, so
# that 64-bit operands are not rounded through single precision.
binop("fmod", tfloat, "",
      "src0 - src1 * (bit_size == 64 ? floor(src0 / src1) : floorf(src0 / src1))")
binop("frem", tfloat, "",
      "src0 - src1 * (bit_size == 64 ? trunc(src0 / src1) : truncf(src0 / src1))")
634 # these integer-aware comparisons return a boolean (0 or ~0)
636 binop_compare("flt", tfloat
, "", "src0 < src1")
637 binop_compare("fge", tfloat
, "", "src0 >= src1")
638 binop_compare("feq", tfloat
, _2src_commutative
, "src0 == src1")
639 binop_compare("fne", tfloat
, _2src_commutative
, "src0 != src1")
640 binop_compare("ilt", tint
, "", "src0 < src1")
641 binop_compare("ige", tint
, "", "src0 >= src1")
642 binop_compare("ieq", tint
, _2src_commutative
, "src0 == src1")
643 binop_compare("ine", tint
, _2src_commutative
, "src0 != src1")
644 binop_compare("ult", tuint
, "", "src0 < src1")
645 binop_compare("uge", tuint
, "", "src0 >= src1")
646 binop_compare32("flt32", tfloat
, "", "src0 < src1")
647 binop_compare32("fge32", tfloat
, "", "src0 >= src1")
648 binop_compare32("feq32", tfloat
, _2src_commutative
, "src0 == src1")
649 binop_compare32("fne32", tfloat
, _2src_commutative
, "src0 != src1")
650 binop_compare32("ilt32", tint
, "", "src0 < src1")
651 binop_compare32("ige32", tint
, "", "src0 >= src1")
652 binop_compare32("ieq32", tint
, _2src_commutative
, "src0 == src1")
653 binop_compare32("ine32", tint
, _2src_commutative
, "src0 != src1")
654 binop_compare32("ult32", tuint
, "", "src0 < src1")
655 binop_compare32("uge32", tuint
, "", "src0 >= src1")
657 # integer-aware GLSL-style comparisons that compare floats and ints
659 binop_reduce("ball_fequal", 1, tbool1
, tfloat
, "{src0} == {src1}",
660 "{src0} && {src1}", "{src}")
661 binop_reduce("bany_fnequal", 1, tbool1
, tfloat
, "{src0} != {src1}",
662 "{src0} || {src1}", "{src}")
663 binop_reduce("ball_iequal", 1, tbool1
, tint
, "{src0} == {src1}",
664 "{src0} && {src1}", "{src}")
665 binop_reduce("bany_inequal", 1, tbool1
, tint
, "{src0} != {src1}",
666 "{src0} || {src1}", "{src}")
668 binop_reduce("b32all_fequal", 1, tbool32
, tfloat
, "{src0} == {src1}",
669 "{src0} && {src1}", "{src}")
670 binop_reduce("b32any_fnequal", 1, tbool32
, tfloat
, "{src0} != {src1}",
671 "{src0} || {src1}", "{src}")
672 binop_reduce("b32all_iequal", 1, tbool32
, tint
, "{src0} == {src1}",
673 "{src0} && {src1}", "{src}")
674 binop_reduce("b32any_inequal", 1, tbool32
, tint
, "{src0} != {src1}",
675 "{src0} || {src1}", "{src}")
677 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
679 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
680 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
681 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
682 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
684 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
685 # and false respectively
687 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
688 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
689 binop("seq", tfloat32
, _2src_commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
690 binop("sne", tfloat32
, _2src_commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
692 # SPIRV shifts are undefined for shift-operands >= bitsize,
693 # but SM5 shifts are defined to use the least significant bits, only
694 # The NIR definition is according to the SM5 specification.
695 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
696 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
697 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
698 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
699 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "",
700 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
702 opcode("urol", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
703 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
704 dst = (src0 << (src1 & rotate_mask)) |
705 (src0 >> (-src1 & rotate_mask));
707 opcode("uror", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
708 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
709 dst = (src0 >> (src1 & rotate_mask)) |
710 (src0 << (-src1 & rotate_mask));
713 # bitwise logic operators
715 # These are also used as boolean and, or, xor for hardware supporting
719 binop("iand", tuint
, _2src_commutative
+ associative
, "src0 & src1")
720 binop("ior", tuint
, _2src_commutative
+ associative
, "src0 | src1")
721 binop("ixor", tuint
, _2src_commutative
+ associative
, "src0 ^ src1")
724 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
727 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
728 "{src0} * {src1}", "{src0} + {src1}", "{src}")
730 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
731 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
732 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
733 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
735 binop("fmin", tfloat
, "", "fminf(src0, src1)")
736 binop("imin", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
737 binop("umin", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
738 binop("fmax", tfloat
, "", "fmaxf(src0, src1)")
739 binop("imax", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
740 binop("umax", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
742 # Saturated vector add for 4 8bit ints.
743 binop("usadd_4x8", tint32
, _2src_commutative
+ associative
, """
745 for (int i = 0; i < 32; i += 8) {
746 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
750 # Saturated vector subtract for 4 8bit ints.
751 binop("ussub_4x8", tint32
, "", """
753 for (int i = 0; i < 32; i += 8) {
754 int src0_chan = (src0 >> i) & 0xff;
755 int src1_chan = (src1 >> i) & 0xff;
756 if (src0_chan > src1_chan)
757 dst |= (src0_chan - src1_chan) << i;
761 # vector min for 4 8bit ints.
762 binop("umin_4x8", tint32
, _2src_commutative
+ associative
, """
764 for (int i = 0; i < 32; i += 8) {
765 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
769 # vector max for 4 8bit ints.
770 binop("umax_4x8", tint32
, _2src_commutative
+ associative
, """
772 for (int i = 0; i < 32; i += 8) {
773 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
777 # unorm multiply: (a * b) / 255.
778 binop("umul_unorm_4x8", tint32
, _2src_commutative
+ associative
, """
780 for (int i = 0; i < 32; i += 8) {
781 int src0_chan = (src0 >> i) & 0xff;
782 int src1_chan = (src1 >> i) & 0xff;
783 dst |= ((src0_chan * src1_chan) / 255) << i;
# Power function.  Use the double-precision pow() for 64-bit operands and the
# single-precision powf() otherwise, matching the other bit_size-dependent
# math opcodes (fsqrt, fsin, ...).
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
789 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
790 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
792 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
793 "src0 | ((uint64_t)src1 << 32)")
795 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
796 "src0 | ((uint32_t)src1 << 16)")
798 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
799 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
800 # are from the low five bits of src0 and src1, respectively.
801 binop_convert("bfm", tuint32
, tint32
, "", """
802 int bits = src0 & 0x1F;
803 int offset = src1 & 0x1F;
804 dst = ((1u << bits) - 1) << offset;
807 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], False, "", """
808 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
809 /* flush denormals to zero. */
811 dst = copysignf(0.0f, src0);
814 # Combines the first component of each input to make a 2-component vector.
816 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
822 binop("extract_u8", tuint
, "", "(uint8_t)(src0 >> (src1 * 8))")
823 binop("extract_i8", tint
, "", "(int8_t)(src0 >> (src1 * 8))")
826 binop("extract_u16", tuint
, "", "(uint16_t)(src0 >> (src1 * 16))")
827 binop("extract_i16", tint
, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose sources and result all have type ty.

    All sizes are passed as 0, the opcode is not flagged as a conversion,
    and alg_props is forwarded as its algebraic-properties string.
    """
    opcode(name,
           output_size=0,
           output_type=ty,
           input_sizes=[0] * 3,
           input_types=[ty] * 3,
           is_conversion=False,
           algebraic_properties=alg_props,
           const_expr=const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source tuint opcode with an explicit size per operand.

    The output and all three sources have type tuint; the opcode is not
    flagged as a conversion and has no algebraic properties.
    """
    sizes = [src1_size, src2_size, src3_size]
    types = [tuint] * 3
    opcode(name, output_size, tuint, sizes, types, False, "", const_expr)
837 triop("ffma", tfloat
, _2src_commutative
, "src0 * src1 + src2")
839 triop("flrp", tfloat
, "", "src0 * (1 - src2) + src1 * src2")
843 # A vector conditional select instruction (like ?:, but operating per-
844 # component on vectors). There are two versions, one for floating point
845 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
848 triop("fcsel", tfloat32
, "", "(src0 != 0.0f) ? src1 : src2")
851 triop("fmin3", tfloat
, "", "fminf(src0, fminf(src1, src2))")
852 triop("imin3", tint
, "", "MIN2(src0, MIN2(src1, src2))")
853 triop("umin3", tuint
, "", "MIN2(src0, MIN2(src1, src2))")
855 triop("fmax3", tfloat
, "", "fmaxf(src0, fmaxf(src1, src2))")
856 triop("imax3", tint
, "", "MAX2(src0, MAX2(src1, src2))")
857 triop("umax3", tuint
, "", "MAX2(src0, MAX2(src1, src2))")
859 triop("fmed3", tfloat
, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
860 triop("imed3", tint
, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
861 triop("umed3", tuint
, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
863 opcode("bcsel", 0, tuint
, [0, 0, 0],
864 [tbool1
, tuint
, tuint
], False, "", "src0 ? src1 : src2")
865 opcode("b32csel", 0, tuint
, [0, 0, 0],
866 [tbool32
, tuint
, tuint
], False, "", "src0 ? src1 : src2")
869 triop("bfi", tuint32
, "", """
870 unsigned mask = src0, insert = src1, base = src2;
879 dst = (base & ~mask) | (insert & mask);
884 triop("bitfield_select", tuint
, "", "(src0 & src1) | (~src0 & src2)")
886 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
887 opcode("ubfe", 0, tuint32
,
888 [0, 0, 0], [tuint32
, tuint32
, tuint32
], False, "", """
889 unsigned base = src0;
890 unsigned offset = src1 & 0x1F;
891 unsigned bits = src2 & 0x1F;
894 } else if (offset + bits < 32) {
895 dst = (base << (32 - bits - offset)) >> (32 - bits);
897 dst = base >> offset;
900 opcode("ibfe", 0, tint32
,
901 [0, 0, 0], [tint32
, tuint32
, tuint32
], False, "", """
903 unsigned offset = src1 & 0x1F;
904 unsigned bits = src2 & 0x1F;
907 } else if (offset + bits < 32) {
908 dst = (base << (32 - bits - offset)) >> (32 - bits);
910 dst = base >> offset;
914 # GLSL bitfieldExtract()
915 opcode("ubitfield_extract", 0, tuint32
,
916 [0, 0, 0], [tuint32
, tint32
, tint32
], False, "", """
917 unsigned base = src0;
918 int offset = src1, bits = src2;
921 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
922 dst = 0; /* undefined per the spec */
924 dst = (base >> offset) & ((1ull << bits) - 1);
927 opcode("ibitfield_extract", 0, tint32
,
928 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
930 int offset = src1, bits = src2;
933 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
936 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
940 # Combines the first component of each input to make a 3-component vector.
942 triop_horiz("vec3", 3, 1, 1, 1, """
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source tuint opcode with an explicit size per operand.

    The output and all four sources have type tuint; the opcode is not
    flagged as a conversion and has no algebraic properties.
    """
    sizes = [src1_size, src2_size, src3_size, src4_size]
    types = [tuint] * 4
    opcode(name, output_size, tuint, sizes, types, False, "", const_expr)
955 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
956 [tuint32
, tuint32
, tint32
, tint32
], False, "", """
957 unsigned base = src0, insert = src1;
958 int offset = src2, bits = src3;
961 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
964 unsigned mask = ((1ull << bits) - 1) << offset;
965 dst = (base & ~mask) | ((insert << offset) & mask);
969 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
976 # ir3-specific instruction that maps directly to mul-add shift high mix,
977 # (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
978 # multiplication (imul) on Freedreno backend..
979 opcode("imadsh_mix16", 1, tint32
,
980 [1, 1, 1], [tint32
, tint32
, tint32
], False, "", """
981 dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;