2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, is_conversion
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - is_conversion is true if this opcode represents a type conversion
- algebraic_properties is a space-separated string, where nir_op_is_ is
44 prepended before each entry
45 - const_expr is an expression or series of statements that computes the
46 constant value of the opcode given the constant values of its inputs.
48 Constant expressions are formed from the variables src0, src1, ...,
49 src(N-1), where N is the number of arguments. The output of the
50 expression should be stored in the dst variable. Per-component input
51 and output variables will be scalars and non-per-component input and
52 output variables will be a struct with fields named x, y, z, and w
53 all of the correct type. Input and output variables can be assumed
54 to already be of the correct type and need no conversion. In
55 particular, the conversion from the C bool type to/from NIR_TRUE and
56 NIR_FALSE happens automatically.
58 For per-component instructions, the entire expression will be
59 executed once for each component. For non-per-component
60 instructions, the expression is expected to store the correct values
61 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
62 constant expression, an assignment to dst will happen automatically
63 and the result will be equivalent to "dst = <expression>" for
64 per-component instructions and "dst.x = dst.y = ... = <expression>"
65 for non-per-component instructions.
67 assert isinstance(name
, str)
68 assert isinstance(output_size
, int)
69 assert isinstance(output_type
, str)
70 assert isinstance(input_sizes
, list)
71 assert isinstance(input_sizes
[0], int)
72 assert isinstance(input_types
, list)
73 assert isinstance(input_types
[0], str)
74 assert isinstance(is_conversion
, bool)
75 assert isinstance(algebraic_properties
, str)
76 assert isinstance(const_expr
, str)
77 assert len(input_sizes
) == len(input_types
)
78 assert 0 <= output_size
<= 4 or (output_size
== 8) or (output_size
== 16)
79 for size
in input_sizes
:
84 self
.num_inputs
= len(input_sizes
)
85 self
.output_size
= output_size
86 self
.output_type
= output_type
87 self
.input_sizes
= input_sizes
88 self
.input_types
= input_types
89 self
.is_conversion
= is_conversion
90 self
.algebraic_properties
= algebraic_properties
91 self
.const_expr
= const_expr
93 # helper variables for strings
110 _TYPE_SPLIT_RE
= re
.compile(r
'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
def type_has_size(type_):
    """Tell whether a NIR type string carries an explicit bit size.

    The string is parsed with _TYPE_SPLIT_RE; the result is True when the
    optional "bits" group was present (e.g. a trailing number) and False
    otherwise.  A string the pattern does not match trips an assertion.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
def type_size(type_):
    """Return the bit size embedded in a NIR type string as an int.

    Asserts that the string matches _TYPE_SPLIT_RE and that it actually
    contains a bit-size suffix; unsized type strings are rejected.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
124 def type_sizes(type_
):
125 if type_has_size(type_
):
126 return [type_size(type_
)]
127 elif type_
== 'bool':
128 return [1, 8, 16, 32]
129 elif type_
== 'float':
132 return [1, 8, 16, 32, 64]
def type_base_type(type_):
    """Return the base type name of a NIR type string.

    The base name is whatever the "type" group of _TYPE_SPLIT_RE captured
    (one of int, uint, float or bool), with any bit-size suffix dropped.
    Asserts that the string matches the pattern.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
139 # Operation where the first two sources are commutative.
# For 2-source operations, this is just mathematical commutativity.  Some
142 # 3-source operations, like ffma, are only commutative in the first two
144 _2src_commutative
= "2src_commutative "
145 associative
= "associative "
147 # global dictionary of opcodes
150 def opcode(name
, output_size
, output_type
, input_sizes
, input_types
,
151 is_conversion
, algebraic_properties
, const_expr
):
152 assert name
not in opcodes
153 opcodes
[name
] = Opcode(name
, output_size
, output_type
, input_sizes
,
154 input_types
, is_conversion
, algebraic_properties
,
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output type may differ from its input.

    Output size and the single input size are both 0, the opcode is not
    flagged as a conversion, and it has no algebraic properties.
    """
    input_sizes = [0]
    input_types = [in_type]
    opcode(name, 0, out_type, input_sizes, input_types, False, "", const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose input and output share the type `ty`.

    Output size and the single input size are both 0, the opcode is not
    flagged as a conversion, and it has no algebraic properties.
    """
    input_sizes = [0]
    input_types = [ty]
    opcode(name, 0, ty, input_sizes, input_types, False, "", const_expr)
163 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
165 opcode(name
, output_size
, output_type
, [input_size
], [input_type
],
166 False, "", const_expr
)
168 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
169 reduce_expr
, final_expr
):
171 return "(" + prereduce_expr
.format(src
=src
) + ")"
173 return final_expr
.format(src
="(" + src
+ ")")
174 def reduce_(src0
, src1
):
175 return reduce_expr
.format(src0
=src0
, src1
=src1
)
176 src0
= prereduce("src0.x")
177 src1
= prereduce("src0.y")
178 src2
= prereduce("src0.z")
179 src3
= prereduce("src0.w")
180 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
181 final(reduce_(src0
, src1
)))
182 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
183 final(reduce_(reduce_(src0
, src1
), src2
)))
184 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
185 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source numeric conversion opcode.

    Identical to a plain one-source registration (sizes 0, no algebraic
    properties) except that the is_conversion argument of opcode() is True.
    """
    input_sizes = [0]
    input_types = [in_type]
    opcode(name, 0, out_type, input_sizes, input_types, True, "", const_expr)
190 unop("mov", tuint
, "src0")
192 unop("ineg", tint
, "-src0")
193 unop("fneg", tfloat
, "-src0")
194 unop("inot", tint
, "~src0") # invert every bit of the integer
195 unop("fsign", tfloat
, ("bit_size == 64 ? " +
196 "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
197 "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
198 unop("isign", tint
, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
199 unop("iabs", tint
, "(src0 < 0) ? -src0 : src0")
200 unop("fabs", tfloat
, "fabs(src0)")
201 unop("fsat", tfloat
, ("bit_size == 64 ? " +
202 "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
203 "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
204 unop("frcp", tfloat
, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
205 unop("frsq", tfloat
, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
206 unop("fsqrt", tfloat
, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
207 unop("fexp2", tfloat
, "exp2f(src0)")
208 unop("flog2", tfloat
, "log2f(src0)")
210 # Generate all of the numeric conversion opcodes
211 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
213 dst_types
= [tfloat
, tint
]
215 dst_types
= [tfloat
, tint
, tbool
]
217 dst_types
= [tfloat
, tuint
]
218 elif src_t
== tfloat
:
219 dst_types
= [tint
, tuint
, tfloat
, tbool
]
221 for dst_t
in dst_types
:
222 for dst_bit_size
in type_sizes(dst_t
):
223 if dst_bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
224 rnd_modes
= ['_rtne', '_rtz', '']
225 for rnd_mode
in rnd_modes
:
226 if rnd_mode
== '_rtne':
229 dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
234 elif rnd_mode
== '_rtz':
237 dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
245 unop_numeric_convert("{0}2{1}{2}{3}".format(src_t
[0],
249 dst_t
+ str(dst_bit_size
),
251 elif dst_bit_size
== 32 and dst_t
== tfloat
and src_t
== tfloat
:
253 if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
254 dst = _mesa_double_to_float_rtz(src0);
259 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
261 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
263 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
264 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
266 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
269 # Unary floating-point rounding operations.
272 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
273 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
274 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
275 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
276 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
278 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
280 # Trigonometric operations.
283 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
284 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
287 unop_convert("frexp_exp", tint32
, tfloat
, "frexp(src0, &dst);")
288 unop_convert("frexp_sig", tfloat
, tfloat
, "int n; dst = frexp(src0, &n);")
290 # Partial derivatives.
293 unop("fddx", tfloat
, "0.0") # the derivative of a constant is 0.
294 unop("fddy", tfloat
, "0.0")
295 unop("fddx_fine", tfloat
, "0.0")
296 unop("fddy_fine", tfloat
, "0.0")
297 unop("fddx_coarse", tfloat
, "0.0")
298 unop("fddy_coarse", tfloat
, "0.0")
301 # Floating point pack and unpack operations.
304 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
305 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
306 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
307 """.replace("fmt", fmt
))
310 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
311 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
312 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
313 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
314 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
315 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the "unpack_<fmt>_2x16" opcode.

    The opcode takes one 32-bit unsigned source component and produces two
    32-bit float components: dst.x from the low 16 bits of the source and
    dst.y from the high 16 bits.  Every occurrence of "fmt" in the C
    template is replaced by the `fmt` argument, so the template calls
    unpack_<fmt>_1x16().
    """
    # The high half must be shifted *down*: casting (src0.x << 16) to
    # uint16_t always yields 0, which made dst.y ignore the input.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
324 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
325 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
326 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
327 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
328 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
329 """.replace("fmt", fmt
))
343 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
344 dst.x = (src0.x & 0xffff) | (src0.y << 16);
347 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
348 dst.x = (src0.x << 0) |
354 unop_horiz("pack_32_2x16", 1, tuint32
, 2, tuint16
,
355 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
357 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
358 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
360 unop_horiz("pack_64_4x16", 1, tuint64
, 4, tuint16
,
361 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
363 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
364 "dst.x = src0.x; dst.y = src0.x >> 32;")
# Split one 64-bit source component into four 16-bit components, lowest
# bits first.  The source has a single component, so every destination
# component must be taken from src0.x (src0.w is not a valid input here).
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
369 unop_horiz("unpack_32_2x16", 2, tuint16
, 1, tuint32
,
370 "dst.x = src0.x; dst.y = src0.x >> 16;")
372 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32
, 1, tuint32
, """
373 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
374 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
377 # Lowered floating point unpacking operations.
379 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
380 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
381 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
382 "unpack_half_1x16((uint16_t)(src0 >> 16))")
384 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32
, tuint32
,
385 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
386 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32
, tuint32
,
387 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
389 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
390 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
392 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
393 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
395 # Bit operations, part of ARB_gpu_shader5.
398 unop("bitfield_reverse", tuint32
, """
399 /* we're not winning any awards for speed here, but that's ok */
401 for (unsigned bit = 0; bit < 32; bit++)
402 dst |= ((src0 >> bit) & 1) << (31 - bit);
404 unop_convert("bit_count", tuint32
, tuint
, """
406 for (unsigned bit = 0; bit < bit_size; bit++) {
407 if ((src0 >> bit) & 1)
412 unop_convert("ufind_msb", tint32
, tuint
, """
414 for (int bit = bit_size - 1; bit >= 0; bit--) {
415 if ((src0 >> bit) & 1) {
422 unop("uclz", tuint32
, """
424 for (bit = bit_size - 1; bit >= 0; bit--) {
425 if ((src0 & (1u << bit)) != 0)
428 dst = (unsigned)(31 - bit);
431 unop("ifind_msb", tint32
, """
433 for (int bit = 31; bit >= 0; bit--) {
434 /* If src0 < 0, we're looking for the first 0 bit.
435 * if src0 >= 0, we're looking for the first 1 bit.
437 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
438 (!((src0 >> bit) & 1) && (src0 < 0))) {
445 unop_convert("find_lsb", tint32
, tint
, """
447 for (unsigned bit = 0; bit < bit_size; bit++) {
448 if ((src0 >> bit) & 1) {
456 for i
in range(1, 5):
457 for j
in range(1, 5):
458 unop_horiz("fnoise{0}_{1}".format(i
, j
), i
, tfloat
, j
, tfloat
, "0.0f")
461 # AMD_gcn_shader extended instructions
462 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
464 float absX = fabs(src0.x);
465 float absY = fabs(src0.y);
466 float absZ = fabs(src0.z);
469 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
470 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
471 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
473 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
474 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
475 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
476 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
477 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
478 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
480 dst.x = dst.x / ma + 0.5;
481 dst.y = dst.y / ma + 0.5;
484 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
485 float absX = fabs(src0.x);
486 float absY = fabs(src0.y);
487 float absZ = fabs(src0.z);
488 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
489 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
490 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
491 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
492 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
493 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
496 # Sum of vector components
497 unop_reduce("fsum", 1, tfloat
, tfloat
, "{src}", "{src0} + {src1}", "{src}")
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose output type may differ from its inputs.

    Both sources share `in_type`; the output size and both input sizes are
    0, and the opcode is not flagged as a conversion.  `alg_props` is passed
    through as the algebraic-properties string.
    """
    input_sizes = [0, 0]
    input_types = [in_type, in_type]
    opcode(name, 0, out_type, input_sizes, input_types,
           False, alg_props, const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type `ty`."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on `ty` whose result type is tbool1."""
    result_type = tbool1
    binop_convert(name, result_type, ty, alg_props, const_expr)
def binop_compare8(name, ty, alg_props, const_expr):
    """Register a two-source comparison on `ty` whose result type is tbool8."""
    result_type = tbool8
    binop_convert(name, result_type, ty, alg_props, const_expr)
def binop_compare16(name, ty, alg_props, const_expr):
    """Register a two-source comparison on `ty` whose result type is tbool16."""
    result_type = tbool16
    binop_convert(name, result_type, ty, alg_props, const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on `ty` whose result type is tbool32."""
    result_type = tbool32
    binop_convert(name, result_type, ty, alg_props, const_expr)
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register one comparison opcode per boolean result size.

    The plain `name` goes through binop_compare; the variants suffixed with
    "8", "16" and "32" go through binop_compare8, binop_compare16 and
    binop_compare32 respectively.  All four share the same source type,
    algebraic properties and constant expression.
    """
    registrars = (
        ("", binop_compare),
        ("8", binop_compare8),
        ("16", binop_compare16),
        ("32", binop_compare32),
    )
    for suffix, register in registrars:
        register(name + suffix, ty, alg_props, const_expr)
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types per operand.

    The opcode is not flagged as a conversion and has no algebraic
    properties.
    """
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, False, "", const_expr)
529 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
530 reduce_expr
, final_expr
):
532 return final_expr
.format(src
= "(" + src
+ ")")
533 def reduce_(src0
, src1
):
534 return reduce_expr
.format(src0
=src0
, src1
=src1
)
535 def prereduce(src0
, src1
):
536 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
537 src0
= prereduce("src0.x", "src1.x")
538 src1
= prereduce("src0.y", "src1.y")
539 src2
= prereduce("src0.z", "src1.z")
540 src3
= prereduce("src0.w", "src1.w")
541 opcode(name
+ "2", output_size
, output_type
,
542 [2, 2], [src_type
, src_type
], False, _2src_commutative
,
543 final(reduce_(src0
, src1
)))
544 opcode(name
+ "3", output_size
, output_type
,
545 [3, 3], [src_type
, src_type
], False, _2src_commutative
,
546 final(reduce_(reduce_(src0
, src1
), src2
)))
547 opcode(name
+ "4", output_size
, output_type
,
548 [4, 4], [src_type
, src_type
], False, _2src_commutative
,
549 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Register a binop_reduce family once per boolean result size.

    The original `name` is used with tbool1.  For the tbool8, tbool16 and
    tbool32 variants the first character of `name` is dropped and replaced
    by "b8", "b16" and "b32" respectively.
    """
    tail = name[1:]
    variants = (
        (name, tbool1),
        ("b8" + tail, tbool8),
        ("b16" + tail, tbool16),
        ("b32" + tail, tbool32),
    )
    for variant_name, bool_type in variants:
        binop_reduce(variant_name, output_size, bool_type, src_type,
                     prereduce_expr, reduce_expr, final_expr)
562 binop("fadd", tfloat
, _2src_commutative
+ associative
,"""
563 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
565 dst = _mesa_double_add_rtz(src0, src1);
567 dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
572 binop("iadd", tint
, _2src_commutative
+ associative
, "src0 + src1")
573 binop("iadd_sat", tint
, _2src_commutative
, """
575 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
576 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
578 binop("uadd_sat", tuint
, _2src_commutative
,
579 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
580 binop("isub_sat", tint
, "", """
582 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
583 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
585 binop("usub_sat", tuint
, "", "src0 < src1 ? 0 : src0 - src1")
587 binop("fsub", tfloat
, "", """
588 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
590 dst = _mesa_double_sub_rtz(src0, src1);
592 dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
597 binop("isub", tint
, "", "src0 - src1")
598 binop_convert("uabs_isub", tuint
, tint
, "", """
599 src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
600 : (uint64_t) src0 - (uint64_t) src1
602 binop("uabs_usub", tuint
, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
604 binop("fmul", tfloat
, _2src_commutative
+ associative
, """
605 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
607 dst = _mesa_double_mul_rtz(src0, src1);
609 dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
614 # low 32-bits of signed/unsigned integer multiply
615 binop("imul", tint
, _2src_commutative
+ associative
, "src0 * src1")
# Generate a 64-bit result from two 32-bit quantities
618 binop_convert("imul_2x32_64", tint64
, tint32
, _2src_commutative
,
619 "(int64_t)src0 * (int64_t)src1")
620 binop_convert("umul_2x32_64", tuint64
, tuint32
, _2src_commutative
,
621 "(uint64_t)src0 * (uint64_t)src1")
623 # high 32-bits of signed integer multiply
624 binop("imul_high", tint
, _2src_commutative
, """
625 if (bit_size == 64) {
626 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
627 * extension to work properly. The casts are kind-of annoying but needed
628 * to prevent compiler warnings.
630 uint32_t src0_u32[4] = {
636 uint32_t src1_u32[4] = {
642 uint32_t prod_u32[4];
643 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
644 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
646 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
650 # high 32-bits of unsigned integer multiply
651 binop("umul_high", tuint
, _2src_commutative
, """
652 if (bit_size == 64) {
653 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
654 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
655 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
656 uint32_t prod_u32[4];
657 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
658 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
660 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
664 # low 32-bits of unsigned integer multiply
665 binop("umul_low", tuint32
, _2src_commutative
, """
666 uint64_t mask = (1 << (bit_size / 2)) - 1;
667 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
670 # Multiply 32-bits with low 16-bits.
671 binop("imul_32x16", tint32
, "", "src0 * (int16_t) src1")
672 binop("umul_32x16", tuint32
, "", "src0 * (uint16_t) src1")
674 binop("fdiv", tfloat
, "", "src0 / src1")
675 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
676 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
678 # returns a boolean representing the carry resulting from the addition of
679 # the two unsigned arguments.
681 binop_convert("uadd_carry", tuint
, tuint
, _2src_commutative
, "src0 + src1 < src0")
683 # returns a boolean representing the borrow resulting from the subtraction
684 # of the two unsigned arguments.
686 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
688 # hadd: (a + b) >> 1 (without overflow)
689 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
690 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
691 # = 2 * (x & y) + (x & ~y) + (~x & y)
692 # = ((x & y) << 1) + (x ^ y)
694 # Since we know that the bottom bit of (x & y) << 1 is zero,
696 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
697 # = (x & y) + ((x ^ y) >> 1)
698 binop("ihadd", tint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
699 binop("uhadd", tuint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
701 # rhadd: (a + b + 1) >> 1 (without overflow)
702 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
703 # = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
704 # = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
705 # = ((x | y) << 1) - (x ^ y) + 1
# Since we know that the bottom bit of (x | y) << 1 is zero,
#    (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                     = (x | y) - ((x ^ y) >> 1)
# Rounding halving add, (a + b + 1) >> 1, computed without overflowing the
# operand width.  The identity is (a | b) - ((a ^ b) >> 1); using "+" here
# would compute a different value (e.g. a = 3, b = 0 must give 2, not 4).
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
714 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
716 # For signed integers, there are several different possible definitions of
717 # "modulus" or "remainder". We follow the conventions used by LLVM and
718 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
719 # operation while the imod opcode implements the more mathematical
720 # "modulus" operation. For details on the difference, see
722 # http://mathforum.org/library/drmath/view/52343.html
724 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
725 binop("imod", tint
, "",
726 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
727 " src0 % src1 : src0 % src1 + src1)")
728 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
729 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
736 # these integer-aware comparisons return a boolean (0 or ~0)
738 binop_compare_all_sizes("flt", tfloat
, "", "src0 < src1")
739 binop_compare_all_sizes("fge", tfloat
, "", "src0 >= src1")
740 binop_compare_all_sizes("feq", tfloat
, _2src_commutative
, "src0 == src1")
741 binop_compare_all_sizes("fne", tfloat
, _2src_commutative
, "src0 != src1")
742 binop_compare_all_sizes("ilt", tint
, "", "src0 < src1")
743 binop_compare_all_sizes("ige", tint
, "", "src0 >= src1")
744 binop_compare_all_sizes("ieq", tint
, _2src_commutative
, "src0 == src1")
745 binop_compare_all_sizes("ine", tint
, _2src_commutative
, "src0 != src1")
746 binop_compare_all_sizes("ult", tuint
, "", "src0 < src1")
747 binop_compare_all_sizes("uge", tuint
, "", "src0 >= src1")
749 # integer-aware GLSL-style comparisons that compare floats and ints
751 binop_reduce_all_sizes("ball_fequal", 1, tfloat
, "{src0} == {src1}",
752 "{src0} && {src1}", "{src}")
753 binop_reduce_all_sizes("bany_fnequal", 1, tfloat
, "{src0} != {src1}",
754 "{src0} || {src1}", "{src}")
755 binop_reduce_all_sizes("ball_iequal", 1, tint
, "{src0} == {src1}",
756 "{src0} && {src1}", "{src}")
757 binop_reduce_all_sizes("bany_inequal", 1, tint
, "{src0} != {src1}",
758 "{src0} || {src1}", "{src}")
760 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
762 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
763 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
764 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
765 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
767 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
768 # and false respectively
770 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
771 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
772 binop("seq", tfloat32
, _2src_commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
773 binop("sne", tfloat32
, _2src_commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
775 # SPIRV shifts are undefined for shift-operands >= bitsize,
776 # but SM5 shifts are defined to use the least significant bits, only
777 # The NIR definition is according to the SM5 specification.
778 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
779 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
780 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
781 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
782 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "",
783 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
785 opcode("urol", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
786 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
787 dst = (src0 << (src1 & rotate_mask)) |
788 (src0 >> (-src1 & rotate_mask));
790 opcode("uror", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
791 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
792 dst = (src0 >> (src1 & rotate_mask)) |
793 (src0 << (-src1 & rotate_mask));
796 # bitwise logic operators
798 # These are also used as boolean and, or, xor for hardware supporting
802 binop("iand", tuint
, _2src_commutative
+ associative
, "src0 & src1")
803 binop("ior", tuint
, _2src_commutative
+ associative
, "src0 | src1")
804 binop("ixor", tuint
, _2src_commutative
+ associative
, "src0 ^ src1")
807 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
810 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
811 "{src0} * {src1}", "{src0} + {src1}", "{src}")
813 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
814 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
815 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
816 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
818 binop("fmin", tfloat
, "", "fmin(src0, src1)")
819 binop("imin", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
820 binop("umin", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
821 binop("fmax", tfloat
, "", "fmax(src0, src1)")
822 binop("imax", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
823 binop("umax", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
825 # Saturated vector add for 4 8bit ints.
826 binop("usadd_4x8", tint32
, _2src_commutative
+ associative
, """
828 for (int i = 0; i < 32; i += 8) {
829 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
833 # Saturated vector subtract for 4 8bit ints.
834 binop("ussub_4x8", tint32
, "", """
836 for (int i = 0; i < 32; i += 8) {
837 int src0_chan = (src0 >> i) & 0xff;
838 int src1_chan = (src1 >> i) & 0xff;
839 if (src0_chan > src1_chan)
840 dst |= (src0_chan - src1_chan) << i;
844 # vector min for 4 8bit ints.
845 binop("umin_4x8", tint32
, _2src_commutative
+ associative
, """
847 for (int i = 0; i < 32; i += 8) {
848 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
852 # vector max for 4 8bit ints.
853 binop("umax_4x8", tint32
, _2src_commutative
+ associative
, """
855 for (int i = 0; i < 32; i += 8) {
856 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
860 # unorm multiply: (a * b) / 255.
861 binop("umul_unorm_4x8", tint32
, _2src_commutative
+ associative
, """
863 for (int i = 0; i < 32; i += 8) {
864 int src0_chan = (src0 >> i) & 0xff;
865 int src1_chan = (src1 >> i) & 0xff;
866 dst |= ((src0_chan * src1_chan) / 255) << i;
# Power function.  As with the other float ops in this file, the
# double-precision C function is used for 64-bit operands and the
# single-precision "f" variant otherwise.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
872 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
873 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
875 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
876 "src0 | ((uint64_t)src1 << 32)")
878 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
879 "src0 | ((uint32_t)src1 << 16)")
881 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
882 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
883 # are from the low five bits of src0 and src1, respectively.
884 binop_convert("bfm", tuint32
, tint32
, "", """
885 int bits = src0 & 0x1F;
886 int offset = src1 & 0x1F;
887 dst = ((1u << bits) - 1) << offset;
890 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], False, "", """
891 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
892 /* flush denormals to zero. */
894 dst = copysignf(0.0f, src0);
897 # Combines the first component of each input to make a 2-component vector.
899 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
# Byte extraction: src1 selects which 8-bit field of src0 is returned; the
# cast decides between zero extension (u8) and sign extension (i8).
binop(
    "extract_u8", tuint, "",
    "(uint8_t)(src0 >> (src1 * 8))",
)
binop(
    "extract_i8", tint, "",
    "(int8_t)(src0 >> (src1 * 8))",
)

# 16-bit word extraction: same scheme with a stride of 16 bits.
binop(
    "extract_u16", tuint, "",
    "(uint16_t)(src0 >> (src1 * 16))",
)
binop(
    "extract_i16", tint, "",
    "(int16_t)(src0 >> (src1 * 16))",
)
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose result and sources all use ty.

    The output size and every input size are passed to opcode() as 0, and
    the opcode is registered as a non-conversion.
    """
    source_sizes = [0] * 3
    source_types = [ty] * 3
    opcode(name, 0, ty, source_sizes, source_types, False, alg_props,
           const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source tuint opcode with explicit component sizes.

    The result and all three sources are typed tuint; the opcode carries no
    algebraic properties and is registered as a non-conversion.
    """
    source_sizes = [src1_size, src2_size, src3_size]
    source_types = [tuint for _ in source_sizes]
    opcode(name, output_size, tuint, source_sizes, source_types, False, "",
           const_expr)
# Fused multiply-add.  When round-toward-zero is requested the software RTZ
# helpers are used, selected by bit size (the remaining, narrower case is
# computed in double and then rounded to float with RTZ).  Otherwise the libm
# fmaf()/fma() routines are used.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")
# Linear interpolation between src0 and src1, weighted by src2.
triop(
    "flrp",
    tfloat,
    "",
    "src0 * (1 - src2) + src1 * src2",
)
# Conditional select (like ?:, but applied per component on vectors).  It
# comes in two flavours: this one takes a floating-point boolean (0.0 vs 1.0)
# as the condition, while the *csel opcodes take an integer boolean
# (0 vs ~0).
triop(
    "fcsel",
    tfloat32,
    "",
    "(src0 != 0.0f) ? src1 : src2",
)
# Three-way minimum, in float, signed and unsigned variants.
triop("fmin3", tfloat, "",
      "fminf(src0, fminf(src1, src2))")
triop("imin3", tint, "",
      "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "",
      "MIN2(src0, MIN2(src1, src2))")

# Three-way maximum, in float, signed and unsigned variants.
triop("fmax3", tfloat, "",
      "fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, "",
      "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "",
      "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "",
      "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint, "",
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "",
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
# Integer-boolean conditional selects.  The four variants differ only in the
# boolean type of the condition source (src0); the selected values are tuint.
for _csel_name, _cond_type in (
        ("bcsel", tbool1),
        ("b8csel", tbool8),
        ("b16csel", tbool16),
        ("b32csel", tbool32)):
    opcode(_csel_name, 0, tuint, [0, 0, 0],
           [_cond_type, tuint, tuint], False, "", "src0 ? src1 : src2")
970 triop("bfi", tuint32
, "", """
971 unsigned mask = src0, insert = src1, base = src2;
980 dst = (base & ~mask) | (insert & mask);
# Bitwise select: for each bit, take src1 where src0 is set and src2 where
# it is clear.
triop(
    "bitfield_select",
    tuint,
    "",
    "(src0 & src1) | (~src0 & src2)",
)
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits
# are used.  ubfe is the unsigned (zero-extending) extract.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
# Signed counterpart of ubfe: base is a signed int, so the right shifts
# replicate the sign bit of the extracted field.  Offset and bits are again
# limited to their low five bits.
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
# GLSL bitfieldExtract(), unsigned variant.  A zero-width field yields 0;
# negative or out-of-range offset/bits are undefined per the spec and fold
# to 0 here.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# GLSL bitfieldExtract(), signed variant.  The field is first shifted so its
# top bit sits in bit 31, then shifted right arithmetically by (32 - bits) so
# it lands in the low bits with its sign extended -- the same scheme ibfe
# uses.  Shifting right by `offset` instead would only be correct when
# offset == 32 - bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1041 # Combines the first component of each input to make a 3-component vector.
1043 triop_horiz("vec3", 3, 1, 1, 1, """
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source tuint opcode with explicit component sizes.

    The result and all four sources are typed tuint; the opcode carries no
    algebraic properties and is registered as a non-conversion.
    """
    source_sizes = [src1_size, src2_size, src3_size, src4_size]
    source_types = [tuint for _ in source_sizes]
    opcode(name, output_size, tuint, source_sizes, source_types,
           False, "", const_expr)
# GLSL bitfieldInsert(): replace `bits` bits of base, starting at `offset`,
# with the low bits of insert.  A zero-width field returns base unchanged;
# negative or out-of-range offset/bits are undefined and fold to 0 here.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1070 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
1077 opcode("vec8", 8, tuint
,
1078 [1] * 8, [tuint
] * 8,
1090 opcode("vec16", 16, tuint
,
1091 [1] * 16, [tuint
] * 16,
# An integer multiply instruction for address calculation.  It is similar to
# imul, except that the result is undefined on overflow, where overflow is
# defined according to the size of the variable being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization pass to
# propagate bounds (i.e., from a load/store intrinsic) to the sources, so
# that lower-precision integer multiplies can be used.  This is useful on
# hardware that has 24-bit or perhaps 16-bit integer multiply.
binop(
    "amul",
    tint,
    _2src_commutative + associative,
    "src0 * src1",
)
# ir3-specific instruction that maps directly to mul-add shift high mix
# (IMADSH_MIX16, i.e. ah * bl << 16 + c).  It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")
# ir3-specific instruction that maps directly to ir3 mad.s24:
# a 24-bit multiply into a 32-bit result (with sign extension), plus a
# 32-bit integer addend.  The "<< 8 >> 8" pairs sign-extend the low 24 bits.
triop(
    "imad24_ir3",
    tint32,
    _2src_commutative,
    "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2",
)
# 24-bit multiply into a 32-bit result (with sign extension): both operands
# are reduced to their sign-extended low 24 bits before multiplying.
binop(
    "imul24",
    tint32,
    _2src_commutative + associative,
    "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)",
)