2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, is_conversion
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - is_conversion is true if this opcode represents a type conversion
- algebraic_properties is a space-separated string, where nir_op_is_ is
44 prepended before each entry
45 - const_expr is an expression or series of statements that computes the
46 constant value of the opcode given the constant values of its inputs.
48 Constant expressions are formed from the variables src0, src1, ...,
49 src(N-1), where N is the number of arguments. The output of the
50 expression should be stored in the dst variable. Per-component input
51 and output variables will be scalars and non-per-component input and
52 output variables will be a struct with fields named x, y, z, and w
53 all of the correct type. Input and output variables can be assumed
54 to already be of the correct type and need no conversion. In
55 particular, the conversion from the C bool type to/from NIR_TRUE and
56 NIR_FALSE happens automatically.
58 For per-component instructions, the entire expression will be
59 executed once for each component. For non-per-component
60 instructions, the expression is expected to store the correct values
61 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
62 constant expression, an assignment to dst will happen automatically
63 and the result will be equivalent to "dst = <expression>" for
64 per-component instructions and "dst.x = dst.y = ... = <expression>"
65 for non-per-component instructions.
67 assert isinstance(name
, str)
68 assert isinstance(output_size
, int)
69 assert isinstance(output_type
, str)
70 assert isinstance(input_sizes
, list)
71 assert isinstance(input_sizes
[0], int)
72 assert isinstance(input_types
, list)
73 assert isinstance(input_types
[0], str)
74 assert isinstance(is_conversion
, bool)
75 assert isinstance(algebraic_properties
, str)
76 assert isinstance(const_expr
, str)
77 assert len(input_sizes
) == len(input_types
)
78 assert 0 <= output_size
<= 4
79 for size
in input_sizes
:
84 self
.num_inputs
= len(input_sizes
)
85 self
.output_size
= output_size
86 self
.output_type
= output_type
87 self
.input_sizes
= input_sizes
88 self
.input_types
= input_types
89 self
.is_conversion
= is_conversion
90 self
.algebraic_properties
= algebraic_properties
91 self
.const_expr
= const_expr
93 # helper variables for strings
# Splits a NIR type string into a base type name ("type" group) and an
# optional trailing decimal bit size ("bits" group).
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def type_has_size(type_):
    """Return True when the NIR type string carries an explicit bit size.

    Asserts that type_ starts with one of the base type names recognised by
    _TYPE_SPLIT_RE.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    width = parsed.group('bits')
    return width is not None


def type_size(type_):
    """Return the bit size encoded in the NIR type string as an int.

    Asserts both that type_ is a recognised type string and that it has a
    bit-size suffix.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    width = parsed.group('bits')
    assert width is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(width)
122 def type_sizes(type_
):
123 if type_has_size(type_
):
124 return [type_size(type_
)]
125 elif type_
== 'bool':
127 elif type_
== 'float':
130 return [1, 8, 16, 32, 64]
def type_base_type(type_):
    """Return the base type name of a NIR type string, without any bit size.

    Asserts that type_ is matched by _TYPE_SPLIT_RE.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
137 # Operation where the first two sources are commutative.
# For 2-source operations, this is just mathematical commutativity.  Some
140 # 3-source operations, like ffma, are only commutative in the first two
142 _2src_commutative
= "2src_commutative "
143 associative
= "associative "
145 # global dictionary of opcodes
148 def opcode(name
, output_size
, output_type
, input_sizes
, input_types
,
149 is_conversion
, algebraic_properties
, const_expr
):
150 assert name
not in opcodes
151 opcodes
[name
] = Opcode(name
, output_size
, output_type
, input_sizes
,
152 input_types
, is_conversion
, algebraic_properties
,
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose source and result types may differ.

    The opcode is added through opcode() with output size 0, a single source
    of size 0 and type in_type, result type out_type, is_conversion False and
    no algebraic properties.  const_expr is passed through unchanged.
    """
    opcode(name=name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0],
           input_types=[in_type],
           is_conversion=False,
           algebraic_properties="",
           const_expr=const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result both have type ty.

    Same registration as unop_convert() with out_type and in_type both set
    to ty.
    """
    unop_convert(name, ty, ty, const_expr)
161 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
163 opcode(name
, output_size
, output_type
, [input_size
], [input_type
],
164 False, "", const_expr
)
166 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
167 reduce_expr
, final_expr
):
169 return "(" + prereduce_expr
.format(src
=src
) + ")"
171 return final_expr
.format(src
="(" + src
+ ")")
172 def reduce_(src0
, src1
):
173 return reduce_expr
.format(src0
=src0
, src1
=src1
)
174 src0
= prereduce("src0.x")
175 src1
= prereduce("src0.y")
176 src2
= prereduce("src0.z")
177 src3
= prereduce("src0.w")
178 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
179 final(reduce_(src0
, src1
)))
180 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
181 final(reduce_(reduce_(src0
, src1
), src2
)))
182 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
183 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode flagged as a type conversion.

    Identical to unop_convert() except that is_conversion is passed to
    opcode() as True.
    """
    source_sizes = [0]
    source_types = [in_type]
    opcode(name, 0, out_type, source_sizes, source_types, True, "", const_expr)
188 unop("mov", tuint
, "src0")
190 unop("ineg", tint
, "-src0")
191 unop("fneg", tfloat
, "-src0")
192 unop("inot", tint
, "~src0") # invert every bit of the integer
193 unop("fsign", tfloat
, ("bit_size == 64 ? " +
194 "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
195 "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
196 unop("isign", tint
, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
197 unop("iabs", tint
, "(src0 < 0) ? -src0 : src0")
198 unop("fabs", tfloat
, "fabs(src0)")
199 unop("fsat", tfloat
, ("bit_size == 64 ? " +
200 "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
201 "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
202 unop("frcp", tfloat
, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
203 unop("frsq", tfloat
, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
204 unop("fsqrt", tfloat
, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
205 unop("fexp2", tfloat
, "exp2f(src0)")
206 unop("flog2", tfloat
, "log2f(src0)")
208 # Generate all of the numeric conversion opcodes
209 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
211 dst_types
= [tfloat
, tint
]
213 dst_types
= [tfloat
, tint
, tbool
]
215 dst_types
= [tfloat
, tuint
]
216 elif src_t
== tfloat
:
217 dst_types
= [tint
, tuint
, tfloat
, tbool
]
219 for dst_t
in dst_types
:
220 for bit_size
in type_sizes(dst_t
):
221 if bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
222 rnd_modes
= ['_rtne', '_rtz', '']
223 for rnd_mode
in rnd_modes
:
224 unop_numeric_convert("{0}2{1}{2}{3}".format(src_t
[0], dst_t
[0],
226 dst_t
+ str(bit_size
), src_t
, "src0")
228 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
229 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0], bit_size
),
230 dst_t
+ str(bit_size
), src_t
, conv_expr
)
233 # Unary floating-point rounding operations.
236 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
237 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
238 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
239 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
240 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
242 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
244 # Trigonometric operations.
247 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
248 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
251 unop_convert("frexp_exp", tint32
, tfloat
, "frexp(src0, &dst);")
252 unop_convert("frexp_sig", tfloat
, tfloat
, "int n; dst = frexp(src0, &n);")
254 # Partial derivatives.
257 unop("fddx", tfloat
, "0.0") # the derivative of a constant is 0.
258 unop("fddy", tfloat
, "0.0")
259 unop("fddx_fine", tfloat
, "0.0")
260 unop("fddy_fine", tfloat
, "0.0")
261 unop("fddx_coarse", tfloat
, "0.0")
262 unop("fddy_coarse", tfloat
, "0.0")
265 # Floating point pack and unpack operations.
268 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
269 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
270 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
271 """.replace("fmt", fmt
))
274 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
275 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
276 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
277 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
278 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
279 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the "unpack_<fmt>_2x16" opcode.

    The opcode takes one tuint32 component and produces two tfloat32
    components.  Every occurrence of "fmt" in the C constant expression is
    replaced by the fmt argument, so the expression calls
    unpack_<fmt>_1x16().
    """
    # dst.x decodes the low 16 bits of the packed word and dst.y the high 16
    # bits.  The high half must be shifted *down*: casting (src0.x << 16) to
    # uint16_t would always yield zero.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
288 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
289 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
290 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
291 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
292 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
293 """.replace("fmt", fmt
))
307 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
308 dst.x = (src0.x & 0xffff) | (src0.y << 16);
311 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
312 dst.x = (src0.x << 0) |
318 unop_horiz("pack_32_2x16", 1, tuint32
, 2, tuint16
,
319 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
321 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
322 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
324 unop_horiz("pack_64_4x16", 1, tuint64
, 4, tuint16
,
325 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
327 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
328 "dst.x = src0.x; dst.y = src0.x >> 32;")
# Split one 64-bit value into four 16-bit components, lowest bits first.
# The source has a single component, so every output (including dst.w) must
# be taken from src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
333 unop_horiz("unpack_32_2x16", 2, tuint16
, 1, tuint32
,
334 "dst.x = src0.x; dst.y = src0.x >> 16;")
336 # Lowered floating point unpacking operations.
339 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
340 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
341 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
342 "unpack_half_1x16((uint16_t)(src0 >> 16))")
344 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
345 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
347 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
348 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
350 # Bit operations, part of ARB_gpu_shader5.
353 unop("bitfield_reverse", tuint32
, """
354 /* we're not winning any awards for speed here, but that's ok */
356 for (unsigned bit = 0; bit < 32; bit++)
357 dst |= ((src0 >> bit) & 1) << (31 - bit);
359 unop_convert("bit_count", tuint32
, tuint
, """
361 for (unsigned bit = 0; bit < bit_size; bit++) {
362 if ((src0 >> bit) & 1)
367 unop_convert("ufind_msb", tint32
, tuint
, """
369 for (int bit = bit_size - 1; bit >= 0; bit--) {
370 if ((src0 >> bit) & 1) {
377 unop("ifind_msb", tint32
, """
379 for (int bit = 31; bit >= 0; bit--) {
380 /* If src0 < 0, we're looking for the first 0 bit.
381 * if src0 >= 0, we're looking for the first 1 bit.
383 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
384 (!((src0 >> bit) & 1) && (src0 < 0))) {
391 unop_convert("find_lsb", tint32
, tint
, """
393 for (unsigned bit = 0; bit < bit_size; bit++) {
394 if ((src0 >> bit) & 1) {
402 for i
in range(1, 5):
403 for j
in range(1, 5):
404 unop_horiz("fnoise{0}_{1}".format(i
, j
), i
, tfloat
, j
, tfloat
, "0.0f")
407 # AMD_gcn_shader extended instructions
408 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
410 float absX = fabs(src0.x);
411 float absY = fabs(src0.y);
412 float absZ = fabs(src0.z);
415 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
416 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
417 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
419 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
420 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
421 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
422 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
423 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
424 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
426 dst.x = dst.x / ma + 0.5;
427 dst.y = dst.y / ma + 0.5;
430 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
431 float absX = fabs(src0.x);
432 float absY = fabs(src0.y);
433 float absZ = fabs(src0.z);
434 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
435 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
436 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
437 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
438 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
439 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share one type.

    Both sources have size 0 and type in_type; the result has size 0 and
    type out_type.  alg_props is forwarded to opcode() as the
    algebraic-properties string and is_conversion is False.
    """
    source_sizes = [0] * 2
    source_types = [in_type] * 2
    opcode(name, 0, out_type, source_sizes, source_types,
           False, alg_props, const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type ty."""
    binop_convert(name,
                  out_type=ty,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source opcode over type ty whose result type is tbool1."""
    result_type = tbool1
    binop_convert(name, result_type, ty, alg_props, const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source opcode over type ty whose result type is tbool32."""
    result_type = tbool32
    binop_convert(name, result_type, ty, alg_props, const_expr)
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types.

    Each source gets its own size and type, and the result has size out_size
    and type out_type.  The opcode is registered with is_conversion False
    and no algebraic properties.
    """
    opcode(name=name,
           output_size=out_size,
           output_type=out_type,
           input_sizes=[src1_size, src2_size],
           input_types=[src1_type, src2_type],
           is_conversion=False,
           algebraic_properties="",
           const_expr=const_expr)
461 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
462 reduce_expr
, final_expr
):
464 return final_expr
.format(src
= "(" + src
+ ")")
465 def reduce_(src0
, src1
):
466 return reduce_expr
.format(src0
=src0
, src1
=src1
)
467 def prereduce(src0
, src1
):
468 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
469 src0
= prereduce("src0.x", "src1.x")
470 src1
= prereduce("src0.y", "src1.y")
471 src2
= prereduce("src0.z", "src1.z")
472 src3
= prereduce("src0.w", "src1.w")
473 opcode(name
+ "2", output_size
, output_type
,
474 [2, 2], [src_type
, src_type
], False, _2src_commutative
,
475 final(reduce_(src0
, src1
)))
476 opcode(name
+ "3", output_size
, output_type
,
477 [3, 3], [src_type
, src_type
], False, _2src_commutative
,
478 final(reduce_(reduce_(src0
, src1
), src2
)))
479 opcode(name
+ "4", output_size
, output_type
,
480 [4, 4], [src_type
, src_type
], False, _2src_commutative
,
481 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
483 binop("fadd", tfloat
, _2src_commutative
+ associative
, "src0 + src1")
484 binop("iadd", tint
, _2src_commutative
+ associative
, "src0 + src1")
485 binop("iadd_sat", tint
, _2src_commutative
, """
487 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
488 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
490 binop("uadd_sat", tuint
, _2src_commutative
,
491 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
492 binop("isub_sat", tint
, "", """
494 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
495 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
497 binop("usub_sat", tuint
, "", "src0 < src1 ? 0 : src0 - src1")
499 binop("fsub", tfloat
, "", "src0 - src1")
500 binop("isub", tint
, "", "src0 - src1")
502 binop("fmul", tfloat
, _2src_commutative
+ associative
, "src0 * src1")
503 # low 32-bits of signed/unsigned integer multiply
504 binop("imul", tint
, _2src_commutative
+ associative
, "src0 * src1")
506 # Generate 64 bit result from 2 32 bits quantity
507 binop_convert("imul_2x32_64", tint64
, tint32
, _2src_commutative
,
508 "(int64_t)src0 * (int64_t)src1")
509 binop_convert("umul_2x32_64", tuint64
, tuint32
, _2src_commutative
,
510 "(uint64_t)src0 * (uint64_t)src1")
512 # high 32-bits of signed integer multiply
513 binop("imul_high", tint
, _2src_commutative
, """
514 if (bit_size == 64) {
515 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
516 * extension to work properly. The casts are kind-of annoying but needed
517 * to prevent compiler warnings.
519 uint32_t src0_u32[4] = {
525 uint32_t src1_u32[4] = {
531 uint32_t prod_u32[4];
532 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
533 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
535 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
539 # high 32-bits of unsigned integer multiply
540 binop("umul_high", tuint
, _2src_commutative
, """
541 if (bit_size == 64) {
542 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
543 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
544 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
545 uint32_t prod_u32[4];
546 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
547 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
549 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
553 # low 32-bits of unsigned integer multiply
554 binop("umul_low", tuint32
, _2src_commutative
, """
555 uint64_t mask = (1 << (bit_size / 2)) - 1;
556 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
560 binop("fdiv", tfloat
, "", "src0 / src1")
561 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
562 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
564 # returns a boolean representing the carry resulting from the addition of
565 # the two unsigned arguments.
567 binop_convert("uadd_carry", tuint
, tuint
, _2src_commutative
, "src0 + src1 < src0")
569 # returns a boolean representing the borrow resulting from the subtraction
570 # of the two unsigned arguments.
572 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
574 # hadd: (a + b) >> 1 (without overflow)
575 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
576 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
577 # = 2 * (x & y) + (x & ~y) + (~x & y)
578 # = ((x & y) << 1) + (x ^ y)
580 # Since we know that the bottom bit of (x & y) << 1 is zero,
582 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
583 # = (x & y) + ((x ^ y) >> 1)
584 binop("ihadd", tint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
585 binop("uhadd", tuint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
587 # rhadd: (a + b + 1) >> 1 (without overflow)
588 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
589 # = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
590 # = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
591 # = ((x | y) << 1) - (x ^ y) + 1
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
596 # = (x | y) - ((x ^ y) >> 1)
# Rounding halving add: (a + b + 1) >> 1 computed without overflow, using
# the identity (x + y + 1) >> 1 == (x | y) - ((x ^ y) >> 1).  The second term
# is subtracted; adding it gives wrong results (e.g. 3 and 0 would yield 4
# instead of 2).
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
600 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
602 # For signed integers, there are several different possible definitions of
603 # "modulus" or "remainder". We follow the conventions used by LLVM and
604 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
605 # operation while the imod opcode implements the more mathematical
606 # "modulus" operation. For details on the difference, see
608 # http://mathforum.org/library/drmath/view/52343.html
610 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
611 binop("imod", tint
, "",
612 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
613 " src0 % src1 : src0 % src1 + src1)")
614 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
615 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
622 # these integer-aware comparisons return a boolean (0 or ~0)
624 binop_compare("flt", tfloat
, "", "src0 < src1")
625 binop_compare("fge", tfloat
, "", "src0 >= src1")
626 binop_compare("feq", tfloat
, _2src_commutative
, "src0 == src1")
627 binop_compare("fne", tfloat
, _2src_commutative
, "src0 != src1")
628 binop_compare("ilt", tint
, "", "src0 < src1")
629 binop_compare("ige", tint
, "", "src0 >= src1")
630 binop_compare("ieq", tint
, _2src_commutative
, "src0 == src1")
631 binop_compare("ine", tint
, _2src_commutative
, "src0 != src1")
632 binop_compare("ult", tuint
, "", "src0 < src1")
633 binop_compare("uge", tuint
, "", "src0 >= src1")
634 binop_compare32("flt32", tfloat
, "", "src0 < src1")
635 binop_compare32("fge32", tfloat
, "", "src0 >= src1")
636 binop_compare32("feq32", tfloat
, _2src_commutative
, "src0 == src1")
637 binop_compare32("fne32", tfloat
, _2src_commutative
, "src0 != src1")
638 binop_compare32("ilt32", tint
, "", "src0 < src1")
639 binop_compare32("ige32", tint
, "", "src0 >= src1")
640 binop_compare32("ieq32", tint
, _2src_commutative
, "src0 == src1")
641 binop_compare32("ine32", tint
, _2src_commutative
, "src0 != src1")
642 binop_compare32("ult32", tuint
, "", "src0 < src1")
643 binop_compare32("uge32", tuint
, "", "src0 >= src1")
645 # integer-aware GLSL-style comparisons that compare floats and ints
647 binop_reduce("ball_fequal", 1, tbool1
, tfloat
, "{src0} == {src1}",
648 "{src0} && {src1}", "{src}")
649 binop_reduce("bany_fnequal", 1, tbool1
, tfloat
, "{src0} != {src1}",
650 "{src0} || {src1}", "{src}")
651 binop_reduce("ball_iequal", 1, tbool1
, tint
, "{src0} == {src1}",
652 "{src0} && {src1}", "{src}")
653 binop_reduce("bany_inequal", 1, tbool1
, tint
, "{src0} != {src1}",
654 "{src0} || {src1}", "{src}")
656 binop_reduce("b32all_fequal", 1, tbool32
, tfloat
, "{src0} == {src1}",
657 "{src0} && {src1}", "{src}")
658 binop_reduce("b32any_fnequal", 1, tbool32
, tfloat
, "{src0} != {src1}",
659 "{src0} || {src1}", "{src}")
660 binop_reduce("b32all_iequal", 1, tbool32
, tint
, "{src0} == {src1}",
661 "{src0} && {src1}", "{src}")
662 binop_reduce("b32any_inequal", 1, tbool32
, tint
, "{src0} != {src1}",
663 "{src0} || {src1}", "{src}")
665 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
667 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
668 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
669 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
670 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
672 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
673 # and false respectively
675 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
676 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
677 binop("seq", tfloat32
, _2src_commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
678 binop("sne", tfloat32
, _2src_commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
680 # SPIRV shifts are undefined for shift-operands >= bitsize,
681 # but SM5 shifts are defined to use the least significant bits, only
682 # The NIR definition is according to the SM5 specification.
683 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
684 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
685 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
686 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
687 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "",
688 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
690 opcode("urol", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
691 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
692 dst = (src0 << (src1 & rotate_mask)) |
693 (src0 >> (-src1 & rotate_mask));
695 opcode("uror", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
696 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
697 dst = (src0 >> (src1 & rotate_mask)) |
698 (src0 << (-src1 & rotate_mask));
701 # bitwise logic operators
703 # These are also used as boolean and, or, xor for hardware supporting
707 binop("iand", tuint
, _2src_commutative
+ associative
, "src0 & src1")
708 binop("ior", tuint
, _2src_commutative
+ associative
, "src0 | src1")
709 binop("ixor", tuint
, _2src_commutative
+ associative
, "src0 ^ src1")
712 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
715 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
716 "{src0} * {src1}", "{src0} + {src1}", "{src}")
718 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
719 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
720 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
721 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
723 binop("fmin", tfloat
, "", "fminf(src0, src1)")
724 binop("imin", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
725 binop("umin", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
726 binop("fmax", tfloat
, "", "fmaxf(src0, src1)")
727 binop("imax", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
728 binop("umax", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
730 # Saturated vector add for 4 8bit ints.
731 binop("usadd_4x8", tint32
, _2src_commutative
+ associative
, """
733 for (int i = 0; i < 32; i += 8) {
734 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
738 # Saturated vector subtract for 4 8bit ints.
739 binop("ussub_4x8", tint32
, "", """
741 for (int i = 0; i < 32; i += 8) {
742 int src0_chan = (src0 >> i) & 0xff;
743 int src1_chan = (src1 >> i) & 0xff;
744 if (src0_chan > src1_chan)
745 dst |= (src0_chan - src1_chan) << i;
749 # vector min for 4 8bit ints.
750 binop("umin_4x8", tint32
, _2src_commutative
+ associative
, """
752 for (int i = 0; i < 32; i += 8) {
753 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
757 # vector max for 4 8bit ints.
758 binop("umax_4x8", tint32
, _2src_commutative
+ associative
, """
760 for (int i = 0; i < 32; i += 8) {
761 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
765 # unorm multiply: (a * b) / 255.
766 binop("umul_unorm_4x8", tint32
, _2src_commutative
+ associative
, """
768 for (int i = 0; i < 32; i += 8) {
769 int src0_chan = (src0 >> i) & 0xff;
770 int src1_chan = (src1 >> i) & 0xff;
771 dst |= ((src0_chan * src1_chan) / 255) << i;
# Use the double-precision pow() for 64-bit operands and powf() otherwise,
# matching the other "bit_size == 64 ? f(x) : ff(x)" constant expressions.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
777 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
778 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
780 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
781 "src0 | ((uint64_t)src1 << 32)")
783 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
784 "src0 | ((uint32_t)src1 << 16)")
786 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
787 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
788 # are from the low five bits of src0 and src1, respectively.
789 binop_convert("bfm", tuint32
, tint32
, "", """
790 int bits = src0 & 0x1F;
791 int offset = src1 & 0x1F;
792 dst = ((1u << bits) - 1) << offset;
795 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], False, "", """
796 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
797 /* flush denormals to zero. */
799 dst = copysignf(0.0f, src0);
802 # Combines the first component of each input to make a 2-component vector.
804 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
810 binop("extract_u8", tuint
, "", "(uint8_t)(src0 >> (src1 * 8))")
811 binop("extract_i8", tint
, "", "(int8_t)(src0 >> (src1 * 8))")
814 binop("extract_u16", tuint
, "", "(uint16_t)(src0 >> (src1 * 16))")
815 binop("extract_i16", tint
, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose sources and result all have type ty.

    All sizes are 0, is_conversion is False, and alg_props is forwarded to
    opcode() as the algebraic-properties string.
    """
    source_count = 3
    opcode(name, 0, ty, [0] * source_count, [ty] * source_count,
           False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source tuint opcode with explicit sizes.

    The result and all three sources have type tuint.  The opcode is
    registered with is_conversion False and no algebraic properties.
    """
    opcode(name=name,
           output_size=output_size,
           output_type=tuint,
           input_sizes=[src1_size, src2_size, src3_size],
           input_types=[tuint, tuint, tuint],
           is_conversion=False,
           algebraic_properties="",
           const_expr=const_expr)
825 triop("ffma", tfloat
, _2src_commutative
, "src0 * src1 + src2")
827 triop("flrp", tfloat
, "", "src0 * (1 - src2) + src1 * src2")
831 # A vector conditional select instruction (like ?:, but operating per-
832 # component on vectors). There are two versions, one for floating point
833 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
836 triop("fcsel", tfloat32
, "", "(src0 != 0.0f) ? src1 : src2")
839 triop("fmin3", tfloat
, "", "fminf(src0, fminf(src1, src2))")
840 triop("imin3", tint
, "", "MIN2(src0, MIN2(src1, src2))")
841 triop("umin3", tuint
, "", "MIN2(src0, MIN2(src1, src2))")
843 triop("fmax3", tfloat
, "", "fmaxf(src0, fmaxf(src1, src2))")
844 triop("imax3", tint
, "", "MAX2(src0, MAX2(src1, src2))")
845 triop("umax3", tuint
, "", "MAX2(src0, MAX2(src1, src2))")
847 triop("fmed3", tfloat
, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
848 triop("imed3", tint
, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
849 triop("umed3", tuint
, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
851 opcode("bcsel", 0, tuint
, [0, 0, 0],
852 [tbool1
, tuint
, tuint
], False, "", "src0 ? src1 : src2")
853 opcode("b32csel", 0, tuint
, [0, 0, 0],
854 [tbool32
, tuint
, tuint
], False, "", "src0 ? src1 : src2")
857 triop("bfi", tuint32
, "", """
858 unsigned mask = src0, insert = src1, base = src2;
867 dst = (base & ~mask) | (insert & mask);
872 triop("bitfield_select", tuint
, "", "(src0 & src1) | (~src0 & src2)")
874 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
875 opcode("ubfe", 0, tuint32
,
876 [0, 0, 0], [tuint32
, tuint32
, tuint32
], False, "", """
877 unsigned base = src0;
878 unsigned offset = src1 & 0x1F;
879 unsigned bits = src2 & 0x1F;
882 } else if (offset + bits < 32) {
883 dst = (base << (32 - bits - offset)) >> (32 - bits);
885 dst = base >> offset;
888 opcode("ibfe", 0, tint32
,
889 [0, 0, 0], [tint32
, tuint32
, tuint32
], False, "", """
891 unsigned offset = src1 & 0x1F;
892 unsigned bits = src2 & 0x1F;
895 } else if (offset + bits < 32) {
896 dst = (base << (32 - bits - offset)) >> (32 - bits);
898 dst = base >> offset;
902 # GLSL bitfieldExtract()
903 opcode("ubitfield_extract", 0, tuint32
,
904 [0, 0, 0], [tuint32
, tint32
, tint32
], False, "", """
905 unsigned base = src0;
906 int offset = src1, bits = src2;
909 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
910 dst = 0; /* undefined per the spec */
912 dst = (base >> offset) & ((1ull << bits) - 1);
915 opcode("ibitfield_extract", 0, tint32
,
916 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
918 int offset = src1, bits = src2;
921 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
924 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
928 # Combines the first component of each input to make a 3-component vector.
930 triop_horiz("vec3", 3, 1, 1, 1, """
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source tuint opcode with explicit sizes.

    The result and all four sources have type tuint.  The opcode is
    registered with is_conversion False and no algebraic properties.
    """
    source_sizes = [src1_size, src2_size, src3_size, src4_size]
    source_types = [tuint] * len(source_sizes)
    opcode(name, output_size, tuint, source_sizes, source_types,
           False, "", const_expr)
943 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
944 [tuint32
, tuint32
, tint32
, tint32
], False, "", """
945 unsigned base = src0, insert = src1;
946 int offset = src2, bits = src3;
949 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
952 unsigned mask = ((1ull << bits) - 1) << offset;
953 dst = (base & ~mask) | ((insert << offset) & mask);
957 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
964 # ir3-specific instruction that maps directly to mul-add shift high mix,
965 # (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
967 opcode("imadsh_mix16", 1, tint32
,
968 [1, 1, 1], [tint32
, tint32
, tint32
], False, "", """
969 dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;