2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, is_conversion
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - is_conversion is true if this opcode represents a type conversion
- algebraic_properties is a space-separated string, where nir_op_is_ is
44 prepended before each entry
45 - const_expr is an expression or series of statements that computes the
46 constant value of the opcode given the constant values of its inputs.
48 Constant expressions are formed from the variables src0, src1, ...,
49 src(N-1), where N is the number of arguments. The output of the
50 expression should be stored in the dst variable. Per-component input
51 and output variables will be scalars and non-per-component input and
52 output variables will be a struct with fields named x, y, z, and w
53 all of the correct type. Input and output variables can be assumed
54 to already be of the correct type and need no conversion. In
55 particular, the conversion from the C bool type to/from NIR_TRUE and
56 NIR_FALSE happens automatically.
58 For per-component instructions, the entire expression will be
59 executed once for each component. For non-per-component
60 instructions, the expression is expected to store the correct values
61 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
62 constant expression, an assignment to dst will happen automatically
63 and the result will be equivalent to "dst = <expression>" for
64 per-component instructions and "dst.x = dst.y = ... = <expression>"
65 for non-per-component instructions.
67 assert isinstance(name
, str)
68 assert isinstance(output_size
, int)
69 assert isinstance(output_type
, str)
70 assert isinstance(input_sizes
, list)
71 assert isinstance(input_sizes
[0], int)
72 assert isinstance(input_types
, list)
73 assert isinstance(input_types
[0], str)
74 assert isinstance(is_conversion
, bool)
75 assert isinstance(algebraic_properties
, str)
76 assert isinstance(const_expr
, str)
77 assert len(input_sizes
) == len(input_types
)
78 assert 0 <= output_size
<= 4
79 for size
in input_sizes
:
84 self
.num_inputs
= len(input_sizes
)
85 self
.output_size
= output_size
86 self
.output_type
= output_type
87 self
.input_sizes
= input_sizes
88 self
.input_types
= input_types
89 self
.is_conversion
= is_conversion
90 self
.algebraic_properties
= algebraic_properties
91 self
.const_expr
= const_expr
93 # helper variables for strings
# Splits a NIR type string into a base type name ("int", "uint", "float" or
# "bool") and an optional run of decimal digits giving the bit size.  The
# helpers below apply it with .match(), so it is anchored at the start of
# the string only.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
def type_has_size(type_):
    """Tell whether a NIR type string carries an explicit bit size.

    Asserts that type_ is recognised by _TYPE_SPLIT_RE; returns True when
    the string has a numeric suffix and False otherwise.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
def type_size(type_):
    """Return the bit size encoded in a sized NIR type string, as an int.

    Asserts both that type_ is recognised by _TYPE_SPLIT_RE and that it
    actually ends in a bit size.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
           'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
122 def type_sizes(type_
):
123 if type_has_size(type_
):
124 return [type_size(type_
)]
125 elif type_
== 'bool':
127 elif type_
== 'float':
130 return [1, 8, 16, 32, 64]
def type_base_type(type_):
    """Return the base type name of a NIR type string, without any bit size.

    Asserts that type_ is recognised by _TYPE_SPLIT_RE.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
137 # Operation where the first two sources are commutative.
# For 2-source operations, this is just mathematical commutativity.  Some
140 # 3-source operations, like ffma, are only commutative in the first two
_2src_commutative = "2src_commutative "
# Operation is associative.  Like the flag above, the string ends in a space
# so that flags can be concatenated with "+" into a space-separated list.
associative = "associative "
145 # global dictionary of opcodes
148 def opcode(name
, output_size
, output_type
, input_sizes
, input_types
,
149 is_conversion
, algebraic_properties
, const_expr
):
150 assert name
not in opcodes
151 opcodes
[name
] = Opcode(name
, output_size
, output_type
, input_sizes
,
152 input_types
, is_conversion
, algebraic_properties
,
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output type may differ from its
    input type.

    Both sizes are passed as 0 (presumably the per-component marker; confirm
    against the Opcode documentation).  The opcode is not flagged as a
    conversion and has no algebraic properties.
    """
    opcode(name,
           output_size=0, output_type=out_type,
           input_sizes=[0], input_types=[in_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and destination share the
    type ty.

    Sizes are passed as 0; the opcode is not a conversion and has no
    algebraic properties.
    """
    opcode(name,
           output_size=0, output_type=ty,
           input_sizes=[0], input_types=[ty],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
161 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
163 opcode(name
, output_size
, output_type
, [input_size
], [input_type
],
164 False, "", const_expr
)
166 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
167 reduce_expr
, final_expr
):
169 return "(" + prereduce_expr
.format(src
=src
) + ")"
171 return final_expr
.format(src
="(" + src
+ ")")
172 def reduce_(src0
, src1
):
173 return reduce_expr
.format(src0
=src0
, src1
=src1
)
174 src0
= prereduce("src0.x")
175 src1
= prereduce("src0.y")
176 src2
= prereduce("src0.z")
177 src3
= prereduce("src0.w")
178 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
179 final(reduce_(src0
, src1
)))
180 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
181 final(reduce_(reduce_(src0
, src1
), src2
)))
182 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
183 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion.

    Identical to unop_convert() except that is_conversion is True.
    """
    opcode(name,
           output_size=0, output_type=out_type,
           input_sizes=[0], input_types=[in_type],
           is_conversion=True, algebraic_properties="",
           const_expr=const_expr)
188 unop("mov", tuint
, "src0")
190 unop("ineg", tint
, "-src0")
191 unop("fneg", tfloat
, "-src0")
192 unop("inot", tint
, "~src0") # invert every bit of the integer
193 unop("fsign", tfloat
, ("bit_size == 64 ? " +
194 "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
195 "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
196 unop("isign", tint
, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
197 unop("iabs", tint
, "(src0 < 0) ? -src0 : src0")
198 unop("fabs", tfloat
, "fabs(src0)")
199 unop("fsat", tfloat
, ("bit_size == 64 ? " +
200 "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
201 "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
202 unop("frcp", tfloat
, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
203 unop("frsq", tfloat
, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
204 unop("fsqrt", tfloat
, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
205 unop("fexp2", tfloat
, "exp2f(src0)")
206 unop("flog2", tfloat
, "log2f(src0)")
208 # Generate all of the numeric conversion opcodes
209 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
211 dst_types
= [tfloat
, tint
]
213 dst_types
= [tfloat
, tint
, tbool
]
215 dst_types
= [tfloat
, tuint
]
216 elif src_t
== tfloat
:
217 dst_types
= [tint
, tuint
, tfloat
, tbool
]
219 for dst_t
in dst_types
:
220 for dst_bit_size
in type_sizes(dst_t
):
221 if dst_bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
222 rnd_modes
= ['_rtne', '_rtz', '']
223 for rnd_mode
in rnd_modes
:
224 if rnd_mode
== '_rtne':
227 dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
232 elif rnd_mode
== '_rtz':
235 dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
243 unop_numeric_convert("{0}2{1}{2}{3}".format(src_t
[0],
247 dst_t
+ str(dst_bit_size
),
249 elif dst_bit_size
== 32 and dst_t
== tfloat
and src_t
== tfloat
:
251 if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
252 dst = _mesa_double_to_float_rtz(src0);
257 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
259 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
261 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
262 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0],
264 dst_t
+ str(dst_bit_size
), src_t
, conv_expr
)
267 # Unary floating-point rounding operations.
270 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
271 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
272 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
273 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
274 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
276 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
278 # Trigonometric operations.
281 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
282 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
285 unop_convert("frexp_exp", tint32
, tfloat
, "frexp(src0, &dst);")
286 unop_convert("frexp_sig", tfloat
, tfloat
, "int n; dst = frexp(src0, &n);")
288 # Partial derivatives.
291 unop("fddx", tfloat
, "0.0") # the derivative of a constant is 0.
292 unop("fddy", tfloat
, "0.0")
293 unop("fddx_fine", tfloat
, "0.0")
294 unop("fddy_fine", tfloat
, "0.0")
295 unop("fddx_coarse", tfloat
, "0.0")
296 unop("fddy_coarse", tfloat
, "0.0")
299 # Floating point pack and unpack operations.
302 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
303 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
304 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
305 """.replace("fmt", fmt
))
308 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
309 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
310 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
311 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
312 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
313 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the unpack_<fmt>_2x16 opcode.

    The opcode takes one 32-bit unsigned source and produces two float32
    components: dst.x is decoded from the low 16 bits and dst.y from the
    high 16 bits, each through unpack_<fmt>_1x16.  Every occurrence of
    "fmt" in the constant expression is replaced with the fmt argument.
    """
    # The high half must be shifted *down* before the uint16_t cast; shifting
    # left by 16 and truncating to 16 bits would always yield zero.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
322 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
323 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
324 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
325 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
326 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
327 """.replace("fmt", fmt
))
341 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
342 dst.x = (src0.x & 0xffff) | (src0.y << 16);
345 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
346 dst.x = (src0.x << 0) |
352 unop_horiz("pack_32_2x16", 1, tuint32
, 2, tuint16
,
353 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
355 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
356 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
358 unop_horiz("pack_64_4x16", 1, tuint64
, 4, tuint16
,
359 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
361 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
362 "dst.x = src0.x; dst.y = src0.x >> 32;")
# Splits one 64-bit value into four 16-bit components.  The source is
# declared with a single component, so every lane (including dst.w) has to
# be carved out of src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
367 unop_horiz("unpack_32_2x16", 2, tuint16
, 1, tuint32
,
368 "dst.x = src0.x; dst.y = src0.x >> 16;")
370 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32
, 1, tuint32
, """
371 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
372 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
375 # Lowered floating point unpacking operations.
377 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
378 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
379 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
380 "unpack_half_1x16((uint16_t)(src0 >> 16))")
382 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32
, tuint32
,
383 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
384 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32
, tuint32
,
385 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
387 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
388 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
390 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
391 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
393 # Bit operations, part of ARB_gpu_shader5.
396 unop("bitfield_reverse", tuint32
, """
397 /* we're not winning any awards for speed here, but that's ok */
399 for (unsigned bit = 0; bit < 32; bit++)
400 dst |= ((src0 >> bit) & 1) << (31 - bit);
402 unop_convert("bit_count", tuint32
, tuint
, """
404 for (unsigned bit = 0; bit < bit_size; bit++) {
405 if ((src0 >> bit) & 1)
410 unop_convert("ufind_msb", tint32
, tuint
, """
412 for (int bit = bit_size - 1; bit >= 0; bit--) {
413 if ((src0 >> bit) & 1) {
420 unop("ifind_msb", tint32
, """
422 for (int bit = 31; bit >= 0; bit--) {
423 /* If src0 < 0, we're looking for the first 0 bit.
424 * if src0 >= 0, we're looking for the first 1 bit.
426 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
427 (!((src0 >> bit) & 1) && (src0 < 0))) {
434 unop_convert("find_lsb", tint32
, tint
, """
436 for (unsigned bit = 0; bit < bit_size; bit++) {
437 if ((src0 >> bit) & 1) {
445 for i
in range(1, 5):
446 for j
in range(1, 5):
447 unop_horiz("fnoise{0}_{1}".format(i
, j
), i
, tfloat
, j
, tfloat
, "0.0f")
450 # AMD_gcn_shader extended instructions
451 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
453 float absX = fabs(src0.x);
454 float absY = fabs(src0.y);
455 float absZ = fabs(src0.z);
458 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
459 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
460 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
462 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
463 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
464 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
465 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
466 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
467 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
469 dst.x = dst.x / ma + 0.5;
470 dst.y = dst.y / ma + 0.5;
473 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
474 float absX = fabs(src0.x);
475 float absY = fabs(src0.y);
476 float absZ = fabs(src0.z);
477 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
478 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
479 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
480 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
481 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
482 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
485 # Sum of vector components
486 unop_reduce("fsum", 1, tfloat
, tfloat
, "{src}", "{src0} + {src1}", "{src}")
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type and whose
    result has type out_type.

    All sizes are passed as 0 and the opcode is not flagged as a conversion;
    alg_props is forwarded as the algebraic property string.
    """
    opcode(name,
           output_size=0, output_type=out_type,
           input_sizes=[0, 0], input_types=[in_type, in_type],
           is_conversion=False, algebraic_properties=alg_props,
           const_expr=const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have
    type ty."""
    binop_convert(name, out_type=ty, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty whose result type is
    tbool1."""
    binop_convert(name, out_type=tbool1, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty whose result type is
    tbool32."""
    binop_convert(name, out_type=tbool32, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register both boolean flavours of a comparison.

    The tbool1 variant is registered under name and the tbool32 variant
    under name + "32"; both share the same constant expression.
    """
    variants = ((binop_compare, ""), (binop_compare32, "32"))
    for register, suffix in variants:
        register(name + suffix, ty, alg_props, const_expr)
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types for the
    output and for each source.

    The opcode is not a conversion and has no algebraic properties.
    """
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, False, "", const_expr)
510 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
511 reduce_expr
, final_expr
):
513 return final_expr
.format(src
= "(" + src
+ ")")
514 def reduce_(src0
, src1
):
515 return reduce_expr
.format(src0
=src0
, src1
=src1
)
516 def prereduce(src0
, src1
):
517 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
518 src0
= prereduce("src0.x", "src1.x")
519 src1
= prereduce("src0.y", "src1.y")
520 src2
= prereduce("src0.z", "src1.z")
521 src3
= prereduce("src0.w", "src1.w")
522 opcode(name
+ "2", output_size
, output_type
,
523 [2, 2], [src_type
, src_type
], False, _2src_commutative
,
524 final(reduce_(src0
, src1
)))
525 opcode(name
+ "3", output_size
, output_type
,
526 [3, 3], [src_type
, src_type
], False, _2src_commutative
,
527 final(reduce_(reduce_(src0
, src1
), src2
)))
528 opcode(name
+ "4", output_size
, output_type
,
529 [4, 4], [src_type
, src_type
], False, _2src_commutative
,
530 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
532 binop("fadd", tfloat
, _2src_commutative
+ associative
,"""
533 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
535 dst = _mesa_double_add_rtz(src0, src1);
537 dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
542 binop("iadd", tint
, _2src_commutative
+ associative
, "src0 + src1")
543 binop("iadd_sat", tint
, _2src_commutative
, """
545 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
546 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
548 binop("uadd_sat", tuint
, _2src_commutative
,
549 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
550 binop("isub_sat", tint
, "", """
552 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
553 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
555 binop("usub_sat", tuint
, "", "src0 < src1 ? 0 : src0 - src1")
557 binop("fsub", tfloat
, "", """
558 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
560 dst = _mesa_double_sub_rtz(src0, src1);
562 dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
567 binop("isub", tint
, "", "src0 - src1")
569 binop("fmul", tfloat
, _2src_commutative
+ associative
, """
570 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
572 dst = _mesa_double_mul_rtz(src0, src1);
574 dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
579 # low 32-bits of signed/unsigned integer multiply
580 binop("imul", tint
, _2src_commutative
+ associative
, "src0 * src1")
582 # Generate 64 bit result from 2 32 bits quantity
583 binop_convert("imul_2x32_64", tint64
, tint32
, _2src_commutative
,
584 "(int64_t)src0 * (int64_t)src1")
585 binop_convert("umul_2x32_64", tuint64
, tuint32
, _2src_commutative
,
586 "(uint64_t)src0 * (uint64_t)src1")
588 # high 32-bits of signed integer multiply
589 binop("imul_high", tint
, _2src_commutative
, """
590 if (bit_size == 64) {
591 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
592 * extension to work properly. The casts are kind-of annoying but needed
593 * to prevent compiler warnings.
595 uint32_t src0_u32[4] = {
601 uint32_t src1_u32[4] = {
607 uint32_t prod_u32[4];
608 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
609 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
611 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
615 # high 32-bits of unsigned integer multiply
616 binop("umul_high", tuint
, _2src_commutative
, """
617 if (bit_size == 64) {
618 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
619 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
620 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
621 uint32_t prod_u32[4];
622 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
623 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
625 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
629 # low 32-bits of unsigned integer multiply
630 binop("umul_low", tuint32
, _2src_commutative
, """
631 uint64_t mask = (1 << (bit_size / 2)) - 1;
632 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
636 binop("fdiv", tfloat
, "", "src0 / src1")
637 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
638 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
640 # returns a boolean representing the carry resulting from the addition of
641 # the two unsigned arguments.
643 binop_convert("uadd_carry", tuint
, tuint
, _2src_commutative
, "src0 + src1 < src0")
645 # returns a boolean representing the borrow resulting from the subtraction
646 # of the two unsigned arguments.
648 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
650 # hadd: (a + b) >> 1 (without overflow)
651 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
652 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
653 # = 2 * (x & y) + (x & ~y) + (~x & y)
654 # = ((x & y) << 1) + (x ^ y)
656 # Since we know that the bottom bit of (x & y) << 1 is zero,
658 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
659 # = (x & y) + ((x ^ y) >> 1)
660 binop("ihadd", tint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
661 binop("uhadd", tuint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
663 # rhadd: (a + b + 1) >> 1 (without overflow)
664 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
665 # = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
666 # = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
667 # = ((x | y) << 1) - (x ^ y) + 1
# Since we know that the bottom bit of (x | y) << 1 is zero,
#    (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
672 # = (x | y) - ((x ^ y) >> 1)
# Rounding halving add, (a + b + 1) >> 1 without intermediate overflow.
# This is (x | y) - ((x ^ y) >> 1): the half-difference is subtracted, not
# added.  Example: 3 and 0 give 3 - 1 = 2, which equals (3 + 0 + 1) >> 1.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
676 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
678 # For signed integers, there are several different possible definitions of
679 # "modulus" or "remainder". We follow the conventions used by LLVM and
680 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
681 # operation while the imod opcode implements the more mathematical
682 # "modulus" operation. For details on the difference, see
684 # http://mathforum.org/library/drmath/view/52343.html
686 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
687 binop("imod", tint
, "",
688 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
689 " src0 % src1 : src0 % src1 + src1)")
690 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
691 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
698 # these integer-aware comparisons return a boolean (0 or ~0)
700 binop_compare_all_sizes("flt", tfloat
, "", "src0 < src1")
701 binop_compare_all_sizes("fge", tfloat
, "", "src0 >= src1")
702 binop_compare_all_sizes("feq", tfloat
, _2src_commutative
, "src0 == src1")
703 binop_compare_all_sizes("fne", tfloat
, _2src_commutative
, "src0 != src1")
704 binop_compare_all_sizes("ilt", tint
, "", "src0 < src1")
705 binop_compare_all_sizes("ige", tint
, "", "src0 >= src1")
706 binop_compare_all_sizes("ieq", tint
, _2src_commutative
, "src0 == src1")
707 binop_compare_all_sizes("ine", tint
, _2src_commutative
, "src0 != src1")
708 binop_compare_all_sizes("ult", tuint
, "", "src0 < src1")
709 binop_compare_all_sizes("uge", tuint
, "", "src0 >= src1")
711 # integer-aware GLSL-style comparisons that compare floats and ints
713 binop_reduce("ball_fequal", 1, tbool1
, tfloat
, "{src0} == {src1}",
714 "{src0} && {src1}", "{src}")
715 binop_reduce("bany_fnequal", 1, tbool1
, tfloat
, "{src0} != {src1}",
716 "{src0} || {src1}", "{src}")
717 binop_reduce("ball_iequal", 1, tbool1
, tint
, "{src0} == {src1}",
718 "{src0} && {src1}", "{src}")
719 binop_reduce("bany_inequal", 1, tbool1
, tint
, "{src0} != {src1}",
720 "{src0} || {src1}", "{src}")
722 binop_reduce("b32all_fequal", 1, tbool32
, tfloat
, "{src0} == {src1}",
723 "{src0} && {src1}", "{src}")
724 binop_reduce("b32any_fnequal", 1, tbool32
, tfloat
, "{src0} != {src1}",
725 "{src0} || {src1}", "{src}")
726 binop_reduce("b32all_iequal", 1, tbool32
, tint
, "{src0} == {src1}",
727 "{src0} && {src1}", "{src}")
728 binop_reduce("b32any_inequal", 1, tbool32
, tint
, "{src0} != {src1}",
729 "{src0} || {src1}", "{src}")
731 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
733 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
734 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
735 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
736 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
738 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
739 # and false respectively
741 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
742 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
743 binop("seq", tfloat32
, _2src_commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
744 binop("sne", tfloat32
, _2src_commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
746 # SPIRV shifts are undefined for shift-operands >= bitsize,
747 # but SM5 shifts are defined to use the least significant bits, only
748 # The NIR definition is according to the SM5 specification.
749 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
750 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
751 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
752 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
753 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "",
754 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
756 opcode("urol", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
757 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
758 dst = (src0 << (src1 & rotate_mask)) |
759 (src0 >> (-src1 & rotate_mask));
761 opcode("uror", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
762 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
763 dst = (src0 >> (src1 & rotate_mask)) |
764 (src0 << (-src1 & rotate_mask));
767 # bitwise logic operators
769 # These are also used as boolean and, or, xor for hardware supporting
773 binop("iand", tuint
, _2src_commutative
+ associative
, "src0 & src1")
774 binop("ior", tuint
, _2src_commutative
+ associative
, "src0 | src1")
775 binop("ixor", tuint
, _2src_commutative
+ associative
, "src0 ^ src1")
778 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
781 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
782 "{src0} * {src1}", "{src0} + {src1}", "{src}")
784 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
785 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
786 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
787 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
789 binop("fmin", tfloat
, "", "fmin(src0, src1)")
790 binop("imin", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
791 binop("umin", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
792 binop("fmax", tfloat
, "", "fmax(src0, src1)")
793 binop("imax", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
794 binop("umax", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
796 # Saturated vector add for 4 8bit ints.
797 binop("usadd_4x8", tint32
, _2src_commutative
+ associative
, """
799 for (int i = 0; i < 32; i += 8) {
800 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
804 # Saturated vector subtract for 4 8bit ints.
805 binop("ussub_4x8", tint32
, "", """
807 for (int i = 0; i < 32; i += 8) {
808 int src0_chan = (src0 >> i) & 0xff;
809 int src1_chan = (src1 >> i) & 0xff;
810 if (src0_chan > src1_chan)
811 dst |= (src0_chan - src1_chan) << i;
815 # vector min for 4 8bit ints.
816 binop("umin_4x8", tint32
, _2src_commutative
+ associative
, """
818 for (int i = 0; i < 32; i += 8) {
819 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
823 # vector max for 4 8bit ints.
824 binop("umax_4x8", tint32
, _2src_commutative
+ associative
, """
826 for (int i = 0; i < 32; i += 8) {
827 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
831 # unorm multiply: (a * b) / 255.
832 binop("umul_unorm_4x8", tint32
, _2src_commutative
+ associative
, """
834 for (int i = 0; i < 32; i += 8) {
835 int src0_chan = (src0 >> i) & 0xff;
836 int src1_chan = (src1 >> i) & 0xff;
837 dst |= ((src0_chan * src1_chan) / 255) << i;
# pow() is the double-precision routine and powf() the single-precision one,
# so the 64-bit case must use pow().  This matches the other
# bit_size-dependent float opcodes in this file (fsqrt, fsin, fcos, ...).
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
843 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
844 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
846 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
847 "src0 | ((uint64_t)src1 << 32)")
849 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
850 "src0 | ((uint32_t)src1 << 16)")
852 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
853 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
854 # are from the low five bits of src0 and src1, respectively.
855 binop_convert("bfm", tuint32
, tint32
, "", """
856 int bits = src0 & 0x1F;
857 int offset = src1 & 0x1F;
858 dst = ((1u << bits) - 1) << offset;
861 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], False, "", """
862 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
863 /* flush denormals to zero. */
865 dst = copysignf(0.0f, src0);
868 # Combines the first component of each input to make a 2-component vector.
870 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
876 binop("extract_u8", tuint
, "", "(uint8_t)(src0 >> (src1 * 8))")
877 binop("extract_i8", tint
, "", "(int8_t)(src0 >> (src1 * 8))")
880 binop("extract_u16", tuint
, "", "(uint16_t)(src0 >> (src1 * 16))")
881 binop("extract_i16", tint
, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose sources and result all have
    type ty.

    All sizes are passed as 0 and the opcode is not flagged as a conversion;
    alg_props is forwarded as the algebraic property string.
    """
    opcode(name,
           output_size=0, output_type=ty,
           input_sizes=[0] * 3, input_types=[ty] * 3,
           is_conversion=False, algebraic_properties=alg_props,
           const_expr=const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source opcode with explicit sizes for the output and
    each source; every operand is typed tuint.

    The opcode is not a conversion and has no algebraic properties.
    """
    sizes = [src1_size, src2_size, src3_size]
    types = [tuint] * len(sizes)
    opcode(name, output_size, tuint, sizes, types, False, "", const_expr)
891 triop("ffma", tfloat
, _2src_commutative
, """
892 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
894 dst = _mesa_double_fma_rtz(src0, src1, src2);
895 else if (bit_size == 32)
896 dst = _mesa_float_fma_rtz(src0, src1, src2);
898 dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
901 dst = fmaf(src0, src1, src2);
903 dst = fma(src0, src1, src2);
907 triop("flrp", tfloat
, "", "src0 * (1 - src2) + src1 * src2")
911 # A vector conditional select instruction (like ?:, but operating per-
912 # component on vectors). There are two versions, one for floating point
913 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
# Float-condition select: picks src1 when src0 compares unequal to 0.0f and
# src2 otherwise. All three sources and the result are tfloat32.
triop(
    "fcsel",
    tfloat32,
    "",
    "(src0 != 0.0f) ? src1 : src2",
)
# Three-way min / max / median.
#
# The float variants are declared with the unsized tfloat type, so their
# constant expressions use the double-precision fmin()/fmax() instead of
# fminf()/fmaxf(): the float-only functions would round 64-bit operands to
# single precision before comparing them. For narrower float sizes the result
# is unchanged, because every operand converts to double exactly and the
# result is always one of the operands.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median of three: clamp src2 into the range [min(src0, src1), max(src0, src1)].
triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
# Boolean-condition select: yields src1 when the condition src0 is true and
# src2 otherwise. The two opcodes differ only in the type of the condition
# source (tbool1 vs. tbool32); the value sources and the result are tuint.
opcode(
    "bcsel",
    0,
    tuint,
    [0, 0, 0],
    [tbool1, tuint, tuint],
    False,
    "",
    "src0 ? src1 : src2",
)
opcode(
    "b32csel",
    0,
    tuint,
    [0, 0, 0],
    [tbool32, tuint, tuint],
    False,
    "",
    "src0 ? src1 : src2",
)
937 triop("bfi", tuint32
, "", """
938 unsigned mask = src0, insert = src1, base = src2;
947 dst = (base & ~mask) | (insert & mask);
# Per-bit select: every bit set in src0 takes the corresponding bit of src1,
# every clear bit takes the corresponding bit of src2.
triop(
    "bitfield_select",
    tuint,
    "",
    "(src0 & src1) | (~src0 & src2)",
)
954 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
955 opcode("ubfe", 0, tuint32
,
956 [0, 0, 0], [tuint32
, tuint32
, tuint32
], False, "", """
957 unsigned base = src0;
958 unsigned offset = src1 & 0x1F;
959 unsigned bits = src2 & 0x1F;
962 } else if (offset + bits < 32) {
963 dst = (base << (32 - bits - offset)) >> (32 - bits);
965 dst = base >> offset;
968 opcode("ibfe", 0, tint32
,
969 [0, 0, 0], [tint32
, tuint32
, tuint32
], False, "", """
971 unsigned offset = src1 & 0x1F;
972 unsigned bits = src2 & 0x1F;
975 } else if (offset + bits < 32) {
976 dst = (base << (32 - bits - offset)) >> (32 - bits);
978 dst = base >> offset;
982 # GLSL bitfieldExtract()
983 opcode("ubitfield_extract", 0, tuint32
,
984 [0, 0, 0], [tuint32
, tint32
, tint32
], False, "", """
985 unsigned base = src0;
986 int offset = src1, bits = src2;
989 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
990 dst = 0; /* undefined per the spec */
992 dst = (base >> offset) & ((1ull << bits) - 1);
995 opcode("ibitfield_extract", 0, tint32
,
996 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
998 int offset = src1, bits = src2;
1001 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
1004 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
1008 # Combines the first component of each input to make a 3-component vector.
1010 triop_horiz("vec3", 3, 1, 1, 1, """
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source opcode with explicit per-source sizes.

    - output_size is forwarded to opcode() as the output size
    - src1_size .. src4_size become the input size list, in order
    - const_expr is forwarded to opcode() unchanged

    The output and all four inputs are typed tuint, no algebraic properties
    are given (empty string), and the sixth positional argument of opcode()
    is False.
    """
    source_sizes = [src1_size, src2_size, src3_size, src4_size]
    source_types = [tuint] * len(source_sizes)
    opcode(name, output_size, tuint, source_sizes, source_types, False, "",
           const_expr)
1023 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
1024 [tuint32
, tuint32
, tint32
, tint32
], False, "", """
1025 unsigned base = src0, insert = src1;
1026 int offset = src2, bits = src3;
1029 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
1032 unsigned mask = ((1ull << bits) - 1) << offset;
1033 dst = (base & ~mask) | ((insert << offset) & mask);
1037 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
1044 # An integer multiply instruction for address calculation. This is
1045 # similar to imul, except that the results are undefined in case of
1046 # overflow. Overflow is defined according to the size of the variable
1047 # being dereferenced.
1049 # This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
1051 # sources, such that lower precision integer multiplies can be used.
1052 # This is useful on hw that has 24b or perhaps 16b integer multiply
# Constant folding treats amul as a plain integer multiply; it is declared
# both 2-source commutative and associative.
binop(
    "amul",
    tint,
    _2src_commutative + associative,
    "src0 * src1",
)
1056 # ir3-specific instruction that maps directly to mul-add shift high mix,
1057 # (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on Freedreno backend.
1059 opcode("imadsh_mix16", 1, tint32
,
1060 [1, 1, 1], [tint32
, tint32
, tint32
], False, "", """
1061 dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
1064 # ir3-specific instruction that maps directly to ir3 mad.s24.
1066 # 24b multiply into 32b result (with sign extension) plus 32b int
# Each multiplicand is narrowed to its low 24 bits by shifting left 8 and
# back right 8 as an int32_t (the sign extension mentioned above); src2 is
# added to the product unmodified.
triop(
    "imad24_ir3",
    tint32,
    _2src_commutative,
    "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2",
)
1070 # 24b multiply into 32b result (with sign extension)
# Both operands are narrowed to their low 24 bits (shift left 8, then right 8
# as an int32_t) before being multiplied.
binop(
    "imul24",
    tint32,
    _2src_commutative + associative,
    "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)",
)