1ab4a3e7a315aaba1eb79fa0eb82d03728488b48
2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, is_conversion
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - is_conversion is true if this opcode represents a type conversion
- algebraic_properties is a space-separated string, where nir_op_is_ is
44 prepended before each entry
45 - const_expr is an expression or series of statements that computes the
46 constant value of the opcode given the constant values of its inputs.
48 Constant expressions are formed from the variables src0, src1, ...,
49 src(N-1), where N is the number of arguments. The output of the
50 expression should be stored in the dst variable. Per-component input
51 and output variables will be scalars and non-per-component input and
52 output variables will be a struct with fields named x, y, z, and w
53 all of the correct type. Input and output variables can be assumed
54 to already be of the correct type and need no conversion. In
55 particular, the conversion from the C bool type to/from NIR_TRUE and
56 NIR_FALSE happens automatically.
58 For per-component instructions, the entire expression will be
59 executed once for each component. For non-per-component
60 instructions, the expression is expected to store the correct values
61 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
62 constant expression, an assignment to dst will happen automatically
63 and the result will be equivalent to "dst = <expression>" for
64 per-component instructions and "dst.x = dst.y = ... = <expression>"
65 for non-per-component instructions.
67 assert isinstance(name
, str)
68 assert isinstance(output_size
, int)
69 assert isinstance(output_type
, str)
70 assert isinstance(input_sizes
, list)
71 assert isinstance(input_sizes
[0], int)
72 assert isinstance(input_types
, list)
73 assert isinstance(input_types
[0], str)
74 assert isinstance(is_conversion
, bool)
75 assert isinstance(algebraic_properties
, str)
76 assert isinstance(const_expr
, str)
77 assert len(input_sizes
) == len(input_types
)
78 assert 0 <= output_size
<= 4
79 for size
in input_sizes
:
84 self
.num_inputs
= len(input_sizes
)
85 self
.output_size
= output_size
86 self
.output_type
= output_type
87 self
.input_sizes
= input_sizes
88 self
.input_types
= input_types
89 self
.is_conversion
= is_conversion
90 self
.algebraic_properties
= algebraic_properties
91 self
.const_expr
= const_expr
93 # helper variables for strings
# Splits a NIR type string into a base type ("type") and an optional
# trailing bit size ("bits").
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def type_has_size(type_):
    """Tell whether a NIR type string carries an explicit bit size.

    Asserts that type_ starts with one of the base types known to
    _TYPE_SPLIT_RE. Returns True when digits follow the base type and
    False otherwise.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
def type_size(type_):
    """Return the bit size encoded in a sized NIR type string, as an int.

    Asserts that type_ is matched by _TYPE_SPLIT_RE and that the match
    includes a bit size.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
122 def type_sizes(type_
):
123 if type_has_size(type_
):
124 return [type_size(type_
)]
125 elif type_
== 'bool':
127 elif type_
== 'float':
130 return [1, 8, 16, 32, 64]
def type_base_type(type_):
    """Return the base type part of a NIR type string, without any bit size.

    Asserts that type_ is matched by _TYPE_SPLIT_RE.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
137 # Operation where the first two sources are commutative.
# For 2-source operations, this is just mathematical commutativity.  Some
140 # 3-source operations, like ffma, are only commutative in the first two
142 _2src_commutative
= "2src_commutative "
143 associative
= "associative "
145 # global dictionary of opcodes
148 def opcode(name
, output_size
, output_type
, input_sizes
, input_types
,
149 is_conversion
, algebraic_properties
, const_expr
):
150 assert name
not in opcodes
151 opcodes
[name
] = Opcode(name
, output_size
, output_type
, input_sizes
,
152 input_types
, is_conversion
, algebraic_properties
,
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output and input types may differ.

    The output size and the single input size are both passed as 0, the
    opcode is not flagged as a conversion, and no algebraic properties are
    set.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0],
           input_types=[in_type],
           is_conversion=False,
           algebraic_properties="",
           const_expr=const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose output and input share the type ty.

    The output size and the single input size are both passed as 0, the
    opcode is not flagged as a conversion, and no algebraic properties are
    set.
    """
    opcode(name,
           output_size=0,
           output_type=ty,
           input_sizes=[0],
           input_types=[ty],
           is_conversion=False,
           algebraic_properties="",
           const_expr=const_expr)
161 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
163 opcode(name
, output_size
, output_type
, [input_size
], [input_type
],
164 False, "", const_expr
)
166 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
167 reduce_expr
, final_expr
):
169 return "(" + prereduce_expr
.format(src
=src
) + ")"
171 return final_expr
.format(src
="(" + src
+ ")")
172 def reduce_(src0
, src1
):
173 return reduce_expr
.format(src0
=src0
, src1
=src1
)
174 src0
= prereduce("src0.x")
175 src1
= prereduce("src0.y")
176 src2
= prereduce("src0.z")
177 src3
= prereduce("src0.w")
178 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
179 final(reduce_(src0
, src1
)))
180 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
181 final(reduce_(reduce_(src0
, src1
), src2
)))
182 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
183 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion.

    Takes the same arguments as unop_convert, but passes is_conversion=True
    to opcode().
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0],
           input_types=[in_type],
           is_conversion=True,
           algebraic_properties="",
           const_expr=const_expr)
188 unop("mov", tuint
, "src0")
190 unop("ineg", tint
, "-src0")
191 unop("fneg", tfloat
, "-src0")
192 unop("inot", tint
, "~src0") # invert every bit of the integer
193 unop("fnot", tfloat
, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
194 "((src0 == 0.0f) ? 1.0f : 0.0f)"))
195 unop("fsign", tfloat
, ("bit_size == 64 ? " +
196 "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
197 "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
198 unop("isign", tint
, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
199 unop("iabs", tint
, "(src0 < 0) ? -src0 : src0")
200 unop("fabs", tfloat
, "fabs(src0)")
201 unop("fsat", tfloat
, ("bit_size == 64 ? " +
202 "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
203 "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
204 unop("frcp", tfloat
, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
205 unop("frsq", tfloat
, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
206 unop("fsqrt", tfloat
, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
207 unop("fexp2", tfloat
, "exp2f(src0)")
208 unop("flog2", tfloat
, "log2f(src0)")
210 # Generate all of the numeric conversion opcodes
211 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
213 dst_types
= [tfloat
, tint
]
215 dst_types
= [tfloat
, tint
, tbool
]
217 dst_types
= [tfloat
, tuint
]
218 elif src_t
== tfloat
:
219 dst_types
= [tint
, tuint
, tfloat
, tbool
]
221 for dst_t
in dst_types
:
222 for bit_size
in type_sizes(dst_t
):
223 if bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
224 rnd_modes
= ['_rtne', '_rtz', '']
225 for rnd_mode
in rnd_modes
:
226 unop_numeric_convert("{0}2{1}{2}{3}".format(src_t
[0], dst_t
[0],
228 dst_t
+ str(bit_size
), src_t
, "src0")
230 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
231 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0], bit_size
),
232 dst_t
+ str(bit_size
), src_t
, conv_expr
)
235 # Unary floating-point rounding operations.
238 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
239 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
240 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
241 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
242 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
244 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
246 # Trigonometric operations.
249 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
250 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
253 unop_convert("frexp_exp", tint32
, tfloat
, "frexp(src0, &dst);")
254 unop_convert("frexp_sig", tfloat
, tfloat
, "int n; dst = frexp(src0, &n);")
256 # Partial derivatives.
259 unop("fddx", tfloat
, "0.0") # the derivative of a constant is 0.
260 unop("fddy", tfloat
, "0.0")
261 unop("fddx_fine", tfloat
, "0.0")
262 unop("fddy_fine", tfloat
, "0.0")
263 unop("fddx_coarse", tfloat
, "0.0")
264 unop("fddy_coarse", tfloat
, "0.0")
267 # Floating point pack and unpack operations.
270 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
271 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
272 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
273 """.replace("fmt", fmt
))
276 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
277 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
278 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
279 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
280 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
281 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the "unpack_<fmt>_2x16" opcode.

    The opcode takes one 32-bit unsigned source and produces two 32-bit
    float components: dst.x from the low 16 bits of src0.x and dst.y from
    the high 16 bits.  Every occurrence of "fmt" in the C template is
    replaced by the given format name.
    """
    # The high half must be shifted *down* by 16.  Shifting left and then
    # truncating to uint16_t always yields 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
290 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
291 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
292 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
293 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
294 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
295 """.replace("fmt", fmt
))
309 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
310 dst.x = (src0.x & 0xffff) | (src0.y << 16);
313 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
314 dst.x = (src0.x << 0) |
320 unop_horiz("pack_32_2x16", 1, tuint32
, 2, tuint16
,
321 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
323 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
324 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
326 unop_horiz("pack_64_4x16", 1, tuint64
, 4, tuint16
,
327 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
329 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
330 "dst.x = src0.x; dst.y = src0.x >> 32;")
# The source has a single 64-bit component, so all four 16-bit lanes must be
# taken from src0.x; a one-component source has no meaningful src0.w.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
335 unop_horiz("unpack_32_2x16", 2, tuint16
, 1, tuint32
,
336 "dst.x = src0.x; dst.y = src0.x >> 16;")
338 # Lowered floating point unpacking operations.
341 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
342 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
343 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
344 "unpack_half_1x16((uint16_t)(src0 >> 16))")
346 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
347 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
349 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
350 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
352 # Bit operations, part of ARB_gpu_shader5.
355 unop("bitfield_reverse", tuint32
, """
356 /* we're not winning any awards for speed here, but that's ok */
358 for (unsigned bit = 0; bit < 32; bit++)
359 dst |= ((src0 >> bit) & 1) << (31 - bit);
361 unop_convert("bit_count", tuint32
, tuint
, """
363 for (unsigned bit = 0; bit < bit_size; bit++) {
364 if ((src0 >> bit) & 1)
369 unop_convert("ufind_msb", tint32
, tuint
, """
371 for (int bit = bit_size - 1; bit >= 0; bit--) {
372 if ((src0 >> bit) & 1) {
379 unop("ifind_msb", tint32
, """
381 for (int bit = 31; bit >= 0; bit--) {
382 /* If src0 < 0, we're looking for the first 0 bit.
383 * if src0 >= 0, we're looking for the first 1 bit.
385 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
386 (!((src0 >> bit) & 1) && (src0 < 0))) {
393 unop_convert("find_lsb", tint32
, tint
, """
395 for (unsigned bit = 0; bit < bit_size; bit++) {
396 if ((src0 >> bit) & 1) {
# Register fnoise<i>_<j> for every output size i and input size j in 1..4.
# The constant expression for all of them is "0.0f".
for i in range(1, 5):
    for j in range(1, 5):
        noise_name = "fnoise{0}_{1}".format(i, j)
        unop_horiz(noise_name, i, tfloat, j, tfloat, "0.0f")
409 # AMD_gcn_shader extended instructions
410 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
412 float absX = fabs(src0.x);
413 float absY = fabs(src0.y);
414 float absZ = fabs(src0.z);
417 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
418 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
419 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
421 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
422 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
423 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
424 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
425 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
426 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
428 dst.x = dst.x / ma + 0.5;
429 dst.y = dst.y / ma + 0.5;
432 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
433 float absX = fabs(src0.x);
434 float absY = fabs(src0.y);
435 float absZ = fabs(src0.z);
436 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
437 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
438 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
439 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
440 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
441 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    The output type is out_type.  The output size and both input sizes are
    passed as 0, and the opcode is not flagged as a conversion.  alg_props
    is forwarded as the algebraic properties string.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0, 0],
           input_types=[in_type, in_type],
           is_conversion=False,
           algebraic_properties=alg_props,
           const_expr=const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and output all use ty."""
    binop_convert(name,
                  out_type=ty,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty whose output type is tbool1."""
    binop_convert(name,
                  out_type=tbool1,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty whose output type is tbool32."""
    binop_convert(name,
                  out_type=tbool32,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types.

    The output and each source get their own size and type.  The opcode is
    not flagged as a conversion and has no algebraic properties.
    """
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, False, "", const_expr)
463 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
464 reduce_expr
, final_expr
):
466 return final_expr
.format(src
= "(" + src
+ ")")
467 def reduce_(src0
, src1
):
468 return reduce_expr
.format(src0
=src0
, src1
=src1
)
469 def prereduce(src0
, src1
):
470 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
471 src0
= prereduce("src0.x", "src1.x")
472 src1
= prereduce("src0.y", "src1.y")
473 src2
= prereduce("src0.z", "src1.z")
474 src3
= prereduce("src0.w", "src1.w")
475 opcode(name
+ "2", output_size
, output_type
,
476 [2, 2], [src_type
, src_type
], False, _2src_commutative
,
477 final(reduce_(src0
, src1
)))
478 opcode(name
+ "3", output_size
, output_type
,
479 [3, 3], [src_type
, src_type
], False, _2src_commutative
,
480 final(reduce_(reduce_(src0
, src1
), src2
)))
481 opcode(name
+ "4", output_size
, output_type
,
482 [4, 4], [src_type
, src_type
], False, _2src_commutative
,
483 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
485 binop("fadd", tfloat
, _2src_commutative
+ associative
, "src0 + src1")
486 binop("iadd", tint
, _2src_commutative
+ associative
, "src0 + src1")
487 binop("iadd_sat", tint
, _2src_commutative
, """
489 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
490 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
492 binop("uadd_sat", tuint
, _2src_commutative
,
493 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
494 binop("isub_sat", tint
, "", """
496 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
497 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
499 binop("usub_sat", tuint
, "", "src0 < src1 ? 0 : src0 - src1")
501 binop("fsub", tfloat
, "", "src0 - src1")
502 binop("isub", tint
, "", "src0 - src1")
504 binop("fmul", tfloat
, _2src_commutative
+ associative
, "src0 * src1")
505 # low 32-bits of signed/unsigned integer multiply
506 binop("imul", tint
, _2src_commutative
+ associative
, "src0 * src1")
# Generate a 64-bit result from two 32-bit quantities
509 binop_convert("imul_2x32_64", tint64
, tint32
, _2src_commutative
,
510 "(int64_t)src0 * (int64_t)src1")
511 binop_convert("umul_2x32_64", tuint64
, tuint32
, _2src_commutative
,
512 "(uint64_t)src0 * (uint64_t)src1")
514 # high 32-bits of signed integer multiply
515 binop("imul_high", tint
, _2src_commutative
, """
516 if (bit_size == 64) {
517 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
518 * extension to work properly. The casts are kind-of annoying but needed
519 * to prevent compiler warnings.
521 uint32_t src0_u32[4] = {
527 uint32_t src1_u32[4] = {
533 uint32_t prod_u32[4];
534 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
535 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
537 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
541 # high 32-bits of unsigned integer multiply
542 binop("umul_high", tuint
, _2src_commutative
, """
543 if (bit_size == 64) {
544 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
545 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
546 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
547 uint32_t prod_u32[4];
548 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
549 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
551 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
555 # low 32-bits of unsigned integer multiply
556 binop("umul_low", tuint32
, _2src_commutative
, """
557 uint64_t mask = (1 << (bit_size / 2)) - 1;
558 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
562 binop("fdiv", tfloat
, "", "src0 / src1")
563 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
564 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
566 # returns a boolean representing the carry resulting from the addition of
567 # the two unsigned arguments.
569 binop_convert("uadd_carry", tuint
, tuint
, _2src_commutative
, "src0 + src1 < src0")
571 # returns a boolean representing the borrow resulting from the subtraction
572 # of the two unsigned arguments.
574 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
576 # hadd: (a + b) >> 1 (without overflow)
577 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
578 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
579 # = 2 * (x & y) + (x & ~y) + (~x & y)
580 # = ((x & y) << 1) + (x ^ y)
582 # Since we know that the bottom bit of (x & y) << 1 is zero,
584 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
585 # = (x & y) + ((x ^ y) >> 1)
586 binop("ihadd", tint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
587 binop("uhadd", tuint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
589 # rhadd: (a + b + 1) >> 1 (without overflow)
590 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
591 # = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
592 # = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
593 # = ((x | y) << 1) - (x ^ y) + 1
# Since we know that the bottom bit of (x | y) << 1 is zero,
#    (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
598 # = (x | y) - ((x ^ y) >> 1)
# Rounding halving add without overflow: (x | y) - ((x ^ y) >> 1).
# The shifted term has to be subtracted.  Adding it gives wrong results,
# e.g. for 1 and 2 it yields 3 + 1 = 4 instead of (1 + 2 + 1) >> 1 = 2.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
602 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
604 # For signed integers, there are several different possible definitions of
605 # "modulus" or "remainder". We follow the conventions used by LLVM and
606 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
607 # operation while the imod opcode implements the more mathematical
608 # "modulus" operation. For details on the difference, see
610 # http://mathforum.org/library/drmath/view/52343.html
612 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
613 binop("imod", tint
, "",
614 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
615 " src0 % src1 : src0 % src1 + src1)")
616 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
617 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
624 # these integer-aware comparisons return a boolean (0 or ~0)
626 binop_compare("flt", tfloat
, "", "src0 < src1")
627 binop_compare("fge", tfloat
, "", "src0 >= src1")
628 binop_compare("feq", tfloat
, _2src_commutative
, "src0 == src1")
629 binop_compare("fne", tfloat
, _2src_commutative
, "src0 != src1")
630 binop_compare("ilt", tint
, "", "src0 < src1")
631 binop_compare("ige", tint
, "", "src0 >= src1")
632 binop_compare("ieq", tint
, _2src_commutative
, "src0 == src1")
633 binop_compare("ine", tint
, _2src_commutative
, "src0 != src1")
634 binop_compare("ult", tuint
, "", "src0 < src1")
635 binop_compare("uge", tuint
, "", "src0 >= src1")
636 binop_compare32("flt32", tfloat
, "", "src0 < src1")
637 binop_compare32("fge32", tfloat
, "", "src0 >= src1")
638 binop_compare32("feq32", tfloat
, _2src_commutative
, "src0 == src1")
639 binop_compare32("fne32", tfloat
, _2src_commutative
, "src0 != src1")
640 binop_compare32("ilt32", tint
, "", "src0 < src1")
641 binop_compare32("ige32", tint
, "", "src0 >= src1")
642 binop_compare32("ieq32", tint
, _2src_commutative
, "src0 == src1")
643 binop_compare32("ine32", tint
, _2src_commutative
, "src0 != src1")
644 binop_compare32("ult32", tuint
, "", "src0 < src1")
645 binop_compare32("uge32", tuint
, "", "src0 >= src1")
647 # integer-aware GLSL-style comparisons that compare floats and ints
649 binop_reduce("ball_fequal", 1, tbool1
, tfloat
, "{src0} == {src1}",
650 "{src0} && {src1}", "{src}")
651 binop_reduce("bany_fnequal", 1, tbool1
, tfloat
, "{src0} != {src1}",
652 "{src0} || {src1}", "{src}")
653 binop_reduce("ball_iequal", 1, tbool1
, tint
, "{src0} == {src1}",
654 "{src0} && {src1}", "{src}")
655 binop_reduce("bany_inequal", 1, tbool1
, tint
, "{src0} != {src1}",
656 "{src0} || {src1}", "{src}")
658 binop_reduce("b32all_fequal", 1, tbool32
, tfloat
, "{src0} == {src1}",
659 "{src0} && {src1}", "{src}")
660 binop_reduce("b32any_fnequal", 1, tbool32
, tfloat
, "{src0} != {src1}",
661 "{src0} || {src1}", "{src}")
662 binop_reduce("b32all_iequal", 1, tbool32
, tint
, "{src0} == {src1}",
663 "{src0} && {src1}", "{src}")
664 binop_reduce("b32any_inequal", 1, tbool32
, tint
, "{src0} != {src1}",
665 "{src0} || {src1}", "{src}")
667 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
669 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
670 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
671 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
672 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
674 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
675 # and false respectively
677 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
678 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
679 binop("seq", tfloat32
, _2src_commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
680 binop("sne", tfloat32
, _2src_commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
682 # SPIRV shifts are undefined for shift-operands >= bitsize,
683 # but SM5 shifts are defined to use the least significant bits, only
684 # The NIR definition is according to the SM5 specification.
685 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
686 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
687 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
688 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
689 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "",
690 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
692 # bitwise logic operators
694 # These are also used as boolean and, or, xor for hardware supporting
698 binop("iand", tuint
, _2src_commutative
+ associative
, "src0 & src1")
699 binop("ior", tuint
, _2src_commutative
+ associative
, "src0 | src1")
700 binop("ixor", tuint
, _2src_commutative
+ associative
, "src0 ^ src1")
703 # floating point logic operators
705 # These use (src != 0.0) for testing the truth of the input, and output 1.0
706 # for true and 0.0 for false
708 binop("fand", tfloat32
, _2src_commutative
,
709 "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
710 binop("for", tfloat32
, _2src_commutative
,
711 "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
712 binop("fxor", tfloat32
, _2src_commutative
,
713 "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
715 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
718 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
719 "{src0} * {src1}", "{src0} + {src1}", "{src}")
721 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
722 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
723 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
724 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
726 binop("fmin", tfloat
, "", "fminf(src0, src1)")
727 binop("imin", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
728 binop("umin", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
729 binop("fmax", tfloat
, "", "fmaxf(src0, src1)")
730 binop("imax", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
731 binop("umax", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
733 # Saturated vector add for 4 8bit ints.
734 binop("usadd_4x8", tint32
, _2src_commutative
+ associative
, """
736 for (int i = 0; i < 32; i += 8) {
737 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
741 # Saturated vector subtract for 4 8bit ints.
742 binop("ussub_4x8", tint32
, "", """
744 for (int i = 0; i < 32; i += 8) {
745 int src0_chan = (src0 >> i) & 0xff;
746 int src1_chan = (src1 >> i) & 0xff;
747 if (src0_chan > src1_chan)
748 dst |= (src0_chan - src1_chan) << i;
752 # vector min for 4 8bit ints.
753 binop("umin_4x8", tint32
, _2src_commutative
+ associative
, """
755 for (int i = 0; i < 32; i += 8) {
756 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
760 # vector max for 4 8bit ints.
761 binop("umax_4x8", tint32
, _2src_commutative
+ associative
, """
763 for (int i = 0; i < 32; i += 8) {
764 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
768 # unorm multiply: (a * b) / 255.
769 binop("umul_unorm_4x8", tint32
, _2src_commutative
+ associative
, """
771 for (int i = 0; i < 32; i += 8) {
772 int src0_chan = (src0 >> i) & 0xff;
773 int src1_chan = (src1 >> i) & 0xff;
774 dst |= ((src0_chan * src1_chan) / 255) << i;
# pow() is the double-precision C routine and powf() the single-precision
# one, so the 64-bit case must call pow().  This matches the bit_size
# selection used for sqrt/sqrtf, sin/sinf, etc. elsewhere in this file.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
780 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
781 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
783 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
784 "src0 | ((uint64_t)src1 << 32)")
786 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
787 "src0 | ((uint32_t)src1 << 16)")
789 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
790 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments is 32.
792 binop_convert("bfm", tuint32
, tint32
, "", """
793 int bits = src0, offset = src1;
794 if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
795 dst = 0; /* undefined */
797 dst = ((1u << bits) - 1) << offset;
800 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], False, "", """
801 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
802 /* flush denormals to zero. */
804 dst = copysignf(0.0f, src0);
807 # Combines the first component of each input to make a 2-component vector.
809 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
815 binop("extract_u8", tuint
, "", "(uint8_t)(src0 >> (src1 * 8))")
816 binop("extract_i8", tint
, "", "(int8_t)(src0 >> (src1 * 8))")
819 binop("extract_u16", tuint
, "", "(uint16_t)(src0 >> (src1 * 16))")
820 binop("extract_i16", tint
, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose sources and output all use ty.

    The output size and all three input sizes are passed as 0, and the
    opcode is not flagged as a conversion.
    """
    opcode(name,
           output_size=0,
           output_type=ty,
           input_sizes=[0, 0, 0],
           input_types=[ty, ty, ty],
           is_conversion=False,
           algebraic_properties=alg_props,
           const_expr=const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source tuint opcode with explicit sizes.

    The output and all three sources are typed tuint.  The opcode is not
    flagged as a conversion and has no algebraic properties.
    """
    sizes = [src1_size, src2_size, src3_size]
    types = [tuint, tuint, tuint]
    opcode(name, output_size, tuint, sizes, types, False, "", const_expr)
830 triop("ffma", tfloat
, _2src_commutative
, "src0 * src1 + src2")
832 triop("flrp", tfloat
, "", "src0 * (1 - src2) + src1 * src2")
836 # A vector conditional select instruction (like ?:, but operating per-
837 # component on vectors). There are two versions, one for floating point
838 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
841 triop("fcsel", tfloat32
, "", "(src0 != 0.0f) ? src1 : src2")
844 triop("fmin3", tfloat
, "", "fminf(src0, fminf(src1, src2))")
845 triop("imin3", tint
, "", "MIN2(src0, MIN2(src1, src2))")
846 triop("umin3", tuint
, "", "MIN2(src0, MIN2(src1, src2))")
848 triop("fmax3", tfloat
, "", "fmaxf(src0, fmaxf(src1, src2))")
849 triop("imax3", tint
, "", "MAX2(src0, MAX2(src1, src2))")
850 triop("umax3", tuint
, "", "MAX2(src0, MAX2(src1, src2))")
852 triop("fmed3", tfloat
, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
853 triop("imed3", tint
, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
854 triop("umed3", tuint
, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
856 opcode("bcsel", 0, tuint
, [0, 0, 0],
857 [tbool1
, tuint
, tuint
], False, "", "src0 ? src1 : src2")
858 opcode("b32csel", 0, tuint
, [0, 0, 0],
859 [tbool32
, tuint
, tuint
], False, "", "src0 ? src1 : src2")
862 triop("bfi", tuint32
, "", """
863 unsigned mask = src0, insert = src1, base = src2;
872 dst = (base & ~mask) | (insert & mask);
876 # SM5 ubfe/ibfe assembly
877 opcode("ubfe", 0, tuint32
,
878 [0, 0, 0], [tuint32
, tint32
, tint32
], False, "", """
879 unsigned base = src0;
880 int offset = src1, bits = src2;
883 } else if (bits < 0 || offset < 0) {
884 dst = 0; /* undefined */
885 } else if (offset + bits < 32) {
886 dst = (base << (32 - bits - offset)) >> (32 - bits);
888 dst = base >> offset;
891 opcode("ibfe", 0, tint32
,
892 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
894 int offset = src1, bits = src2;
897 } else if (bits < 0 || offset < 0) {
898 dst = 0; /* undefined */
899 } else if (offset + bits < 32) {
900 dst = (base << (32 - bits - offset)) >> (32 - bits);
902 dst = base >> offset;
906 # GLSL bitfieldExtract()
907 opcode("ubitfield_extract", 0, tuint32
,
908 [0, 0, 0], [tuint32
, tint32
, tint32
], False, "", """
909 unsigned base = src0;
910 int offset = src1, bits = src2;
913 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
914 dst = 0; /* undefined per the spec */
916 dst = (base >> offset) & ((1ull << bits) - 1);
919 opcode("ibitfield_extract", 0, tint32
,
920 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
922 int offset = src1, bits = src2;
925 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
928 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
932 # Combines the first component of each input to make a 3-component vector.
934 triop_horiz("vec3", 3, 1, 1, 1, """
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source tuint opcode with explicit sizes.

    The output and all four sources are typed tuint.  The opcode is not
    flagged as a conversion and has no algebraic properties.
    """
    sizes = [src1_size, src2_size, src3_size, src4_size]
    types = [tuint, tuint, tuint, tuint]
    opcode(name, output_size, tuint, sizes, types, False, "", const_expr)
947 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
948 [tuint32
, tuint32
, tint32
, tint32
], False, "", """
949 unsigned base = src0, insert = src1;
950 int offset = src2, bits = src3;
953 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
956 unsigned mask = ((1ull << bits) - 1) << offset;
957 dst = (base & ~mask) | ((insert << offset) & mask);
961 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
971 opcode("imadsh_mix16", 1, tint32
,
972 [1, 1, 1], [tint32
, tint32
, tint32
], False, "", """
973 dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;