3325376c81c0e1e7e6d24d43fd1e42ee5ffbcd1d
2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, is_conversion
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - is_conversion is true if this opcode represents a type conversion
43 - algebraic_properties is a space-separated string, where nir_op_is_ is
44 prepended before each entry
45 - const_expr is an expression or series of statements that computes the
46 constant value of the opcode given the constant values of its inputs.
48 Constant expressions are formed from the variables src0, src1, ...,
49 src(N-1), where N is the number of arguments. The output of the
50 expression should be stored in the dst variable. Per-component input
51 and output variables will be scalars and non-per-component input and
52 output variables will be a struct with fields named x, y, z, and w
53 all of the correct type. Input and output variables can be assumed
54 to already be of the correct type and need no conversion. In
55 particular, the conversion from the C bool type to/from NIR_TRUE and
56 NIR_FALSE happens automatically.
58 For per-component instructions, the entire expression will be
59 executed once for each component. For non-per-component
60 instructions, the expression is expected to store the correct values
61 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
62 constant expression, an assignment to dst will happen automatically
63 and the result will be equivalent to "dst = <expression>" for
64 per-component instructions and "dst.x = dst.y = ... = <expression>"
65 for non-per-component instructions.
67 assert isinstance(name
, str)
68 assert isinstance(output_size
, int)
69 assert isinstance(output_type
, str)
70 assert isinstance(input_sizes
, list)
71 assert isinstance(input_sizes
[0], int)
72 assert isinstance(input_types
, list)
73 assert isinstance(input_types
[0], str)
74 assert isinstance(is_conversion
, bool)
75 assert isinstance(algebraic_properties
, str)
76 assert isinstance(const_expr
, str)
77 assert len(input_sizes
) == len(input_types
)
78 assert 0 <= output_size
<= 4
79 for size
in input_sizes
:
84 self
.num_inputs
= len(input_sizes
)
85 self
.output_size
= output_size
86 self
.output_type
= output_type
87 self
.input_sizes
= input_sizes
88 self
.input_types
= input_types
89 self
.is_conversion
= is_conversion
90 self
.algebraic_properties
= algebraic_properties
91 self
.const_expr
= const_expr
93 # helper variables for strings
108 _TYPE_SPLIT_RE
= re
.compile(r
'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
def type_has_size(type_):
    """Return True if the NIR type string ends in an explicit bit size.

    Asserts that type_ is matched by _TYPE_SPLIT_RE.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
def type_size(type_):
    """Return the bit size of a sized NIR type string as an int.

    Asserts that type_ is matched by _TYPE_SPLIT_RE and that it carries a
    bit-size suffix.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
122 def type_sizes(type_
):
123 if type_has_size(type_
):
124 return [type_size(type_
)]
125 elif type_
== 'bool':
127 elif type_
== 'float':
130 return [1, 8, 16, 32, 64]
def type_base_type(type_):
    """Return the base type name of a NIR type string, without any bit size.

    Asserts that type_ is matched by _TYPE_SPLIT_RE.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
137 # Operation where the first two sources are commutative.
139 # For 2-source operations, this is just mathematical commutativity. Some
140 # 3-source operations, like ffma, are only commutative in the first two
142 _2src_commutative
= "2src_commutative "
143 associative
= "associative "
145 # global dictionary of opcodes
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Create an Opcode and register it under `name` in the opcodes dictionary.

    All arguments are forwarded unchanged, in order, to the Opcode
    constructor. Asserts that `name` has not been registered before.
    """
    assert name not in opcodes
    # const_expr is the last constructor argument; it must be forwarded along
    # with the others.
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                           input_types, is_conversion, algebraic_properties,
                           const_expr)
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode with separate output and input types.

    Output size and source size are both 0, is_conversion is False and no
    algebraic properties are set.
    """
    src_sizes = [0]
    src_types = [in_type]
    opcode(name, 0, out_type, src_sizes, src_types, False, "", const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and output share type `ty`.

    Output size and source size are both 0, is_conversion is False and no
    algebraic properties are set.
    """
    src_sizes = [0]
    src_types = [ty]
    opcode(name, 0, ty, src_sizes, src_types, False, "", const_expr)
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and source sizes.

    is_conversion is False and no algebraic properties are set; const_expr is
    passed through to opcode() unchanged.
    """
    opcode(name, output_size, output_type, [input_size], [input_type],
           False, "", const_expr)
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a reduction opcode.

    The opcodes are named name + "2", name + "3" and name + "4". Their
    constant expressions are built from three format templates:

    - prereduce_expr (uses {src}) is applied to each source component,
    - reduce_expr (uses {src0} and {src1}) combines two partial results,
    - final_expr (uses {src}) wraps the fully reduced expression.
    """
    def prereduce(src):
        # Parenthesise so the result nests safely inside reduce_expr.
        return "(" + prereduce_expr.format(src=src) + ")"

    def final(src):
        return final_expr.format(src="(" + src + ")")

    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)

    src0 = prereduce("src0.x")
    src1 = prereduce("src0.y")
    src2 = prereduce("src0.z")
    src3 = prereduce("src0.w")
    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               final(reduce_(src0, src1)))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               final(reduce_(reduce_(src0, src1), src2)))
    # The 4-component form reduces pairwise: (x, y) and (z, w), then both.
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion.

    Identical to a one-source registration with sizes 0 and no algebraic
    properties, except that is_conversion is passed as True.
    """
    src_sizes = [0]
    src_types = [in_type]
    opcode(name, 0, out_type, src_sizes, src_types, True, "", const_expr)
188 unop("mov", tuint
, "src0")
190 unop("ineg", tint
, "-src0")
191 unop("fneg", tfloat
, "-src0")
192 unop("inot", tint
, "~src0") # invert every bit of the integer
193 unop("fsign", tfloat
, ("bit_size == 64 ? " +
194 "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
195 "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
196 unop("isign", tint
, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
197 unop("iabs", tint
, "(src0 < 0) ? -src0 : src0")
198 unop("fabs", tfloat
, "fabs(src0)")
199 unop("fsat", tfloat
, ("bit_size == 64 ? " +
200 "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
201 "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
202 unop("frcp", tfloat
, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
203 unop("frsq", tfloat
, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
204 unop("fsqrt", tfloat
, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
205 unop("fexp2", tfloat
, "exp2f(src0)")
206 unop("flog2", tfloat
, "log2f(src0)")
208 # Generate all of the numeric conversion opcodes
209 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
211 dst_types
= [tfloat
, tint
]
213 dst_types
= [tfloat
, tint
, tbool
]
215 dst_types
= [tfloat
, tuint
]
216 elif src_t
== tfloat
:
217 dst_types
= [tint
, tuint
, tfloat
, tbool
]
219 for dst_t
in dst_types
:
220 for bit_size
in type_sizes(dst_t
):
221 if bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
222 rnd_modes
= ['_rtne', '_rtz', '']
223 for rnd_mode
in rnd_modes
:
224 if rnd_mode
== '_rtne':
227 dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
232 elif rnd_mode
== '_rtz':
235 dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
243 unop_numeric_convert("{0}2{1}{2}{3}".format(src_t
[0], dst_t
[0],
245 dst_t
+ str(bit_size
), src_t
, conv_expr
)
246 elif bit_size
== 32 and dst_t
== tfloat
and src_t
== tfloat
:
248 if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
249 dst = _mesa_double_to_float_rtz(src0);
254 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0], bit_size
),
255 dst_t
+ str(bit_size
), src_t
, conv_expr
)
257 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
258 unop_numeric_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0], bit_size
),
259 dst_t
+ str(bit_size
), src_t
, conv_expr
)
262 # Unary floating-point rounding operations.
265 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
266 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
267 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
268 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
269 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
271 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
273 # Trigonometric operations.
276 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
277 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
280 unop_convert("frexp_exp", tint32
, tfloat
, "frexp(src0, &dst);")
281 unop_convert("frexp_sig", tfloat
, tfloat
, "int n; dst = frexp(src0, &n);")
283 # Partial derivatives.
286 unop("fddx", tfloat
, "0.0") # the derivative of a constant is 0.
287 unop("fddy", tfloat
, "0.0")
288 unop("fddx_fine", tfloat
, "0.0")
289 unop("fddy_fine", tfloat
, "0.0")
290 unop("fddx_coarse", tfloat
, "0.0")
291 unop("fddy_coarse", tfloat
, "0.0")
294 # Floating point pack and unpack operations.
297 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
298 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
299 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
300 """.replace("fmt", fmt
))
303 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
304 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
305 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
306 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
307 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
308 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the "unpack_<fmt>_2x16" opcode.

    The opcode takes one tuint32 component and produces two tfloat32
    components. Every "fmt" in the C template, including the one inside the
    unpack_fmt_1x16 helper name, is replaced by the `fmt` argument.
    """
    # dst.x decodes the low 16 bits and dst.y the high 16 bits. The high half
    # must be shifted down: (uint16_t)(x << 16) is always zero.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
317 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
318 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
319 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
320 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
321 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
322 """.replace("fmt", fmt
))
336 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
337 dst.x = (src0.x & 0xffff) | (src0.y << 16);
340 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
341 dst.x = (src0.x << 0) |
347 unop_horiz("pack_32_2x16", 1, tuint32
, 2, tuint16
,
348 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
350 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
351 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
353 unop_horiz("pack_64_4x16", 1, tuint64
, 4, tuint16
,
354 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
356 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
357 "dst.x = src0.x; dst.y = src0.x >> 32;")
# Split one 64-bit component into four 16-bit components, lowest bits first.
# The source has a single component, so every output must read src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
362 unop_horiz("unpack_32_2x16", 2, tuint16
, 1, tuint32
,
363 "dst.x = src0.x; dst.y = src0.x >> 16;")
# dst.x decodes the low 16 bits and dst.y the high 16 bits of the source.
# The high half must be shifted down: (uint16_t)(x << 16) is always zero.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
370 # Lowered floating point unpacking operations.
372 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
373 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
374 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
375 "unpack_half_1x16((uint16_t)(src0 >> 16))")
377 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32
, tuint32
,
378 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
379 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32
, tuint32
,
380 "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
382 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
383 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
385 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
386 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
388 # Bit operations, part of ARB_gpu_shader5.
391 unop("bitfield_reverse", tuint32
, """
392 /* we're not winning any awards for speed here, but that's ok */
394 for (unsigned bit = 0; bit < 32; bit++)
395 dst |= ((src0 >> bit) & 1) << (31 - bit);
397 unop_convert("bit_count", tuint32
, tuint
, """
399 for (unsigned bit = 0; bit < bit_size; bit++) {
400 if ((src0 >> bit) & 1)
405 unop_convert("ufind_msb", tint32
, tuint
, """
407 for (int bit = bit_size - 1; bit >= 0; bit--) {
408 if ((src0 >> bit) & 1) {
415 unop("ifind_msb", tint32
, """
417 for (int bit = 31; bit >= 0; bit--) {
418 /* If src0 < 0, we're looking for the first 0 bit.
419 * if src0 >= 0, we're looking for the first 1 bit.
421 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
422 (!((src0 >> bit) & 1) && (src0 < 0))) {
429 unop_convert("find_lsb", tint32
, tint
, """
431 for (unsigned bit = 0; bit < bit_size; bit++) {
432 if ((src0 >> bit) & 1) {
440 for i
in range(1, 5):
441 for j
in range(1, 5):
442 unop_horiz("fnoise{0}_{1}".format(i
, j
), i
, tfloat
, j
, tfloat
, "0.0f")
445 # AMD_gcn_shader extended instructions
446 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
448 float absX = fabs(src0.x);
449 float absY = fabs(src0.y);
450 float absZ = fabs(src0.z);
453 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
454 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
455 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
457 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
458 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
459 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
460 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
461 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
462 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
464 dst.x = dst.x / ma + 0.5;
465 dst.y = dst.y / ma + 0.5;
468 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
469 float absX = fabs(src0.x);
470 float absY = fabs(src0.y);
471 float absZ = fabs(src0.z);
472 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
473 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
474 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
475 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
476 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
477 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
480 # Sum of vector components
481 unop_reduce("fsum", 1, tfloat
, tfloat
, "{src}", "{src0} + {src1}", "{src}")
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share `in_type`.

    The output type is `out_type`; output size and both source sizes are 0,
    is_conversion is False, and `alg_props` is passed as the algebraic
    properties string.
    """
    src_sizes = [0, 0]
    src_types = [in_type, in_type]
    opcode(name, 0, out_type, src_sizes, src_types,
           False, alg_props, const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and output all use `ty`."""
    out_type = in_type = ty
    binop_convert(name, out_type, in_type, alg_props, const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source opcode on `ty` sources with a tbool1 result."""
    result_type = tbool1
    binop_convert(name, result_type, ty, alg_props, const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source opcode on `ty` sources with a tbool32 result."""
    result_type = tbool32
    binop_convert(name, result_type, ty, alg_props, const_expr)
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types per operand.

    is_conversion is False and no algebraic properties are set.
    """
    src_sizes = [src1_size, src2_size]
    src_types = [src1_type, src2_type]
    opcode(name, out_size, out_type, src_sizes, src_types,
           False, "", const_expr)
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a two-source reduction.

    The opcodes are named name + "2", name + "3" and name + "4"; each takes
    two sources of `src_type` with that many components and is registered
    with the _2src_commutative property. Constant expressions are built from
    three format templates:

    - prereduce_expr (uses {src0} and {src1}) combines matching components
      of the two sources,
    - reduce_expr (uses {src0} and {src1}) combines two partial results,
    - final_expr (uses {src}) wraps the fully reduced expression.
    """
    def final(src):
        return final_expr.format(src="(" + src + ")")

    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)

    def prereduce(src0, src1):
        # Parenthesise so the result nests safely inside reduce_expr.
        return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"

    src0 = prereduce("src0.x", "src1.x")
    src1 = prereduce("src0.y", "src1.y")
    src2 = prereduce("src0.z", "src1.z")
    src3 = prereduce("src0.w", "src1.w")
    opcode(name + "2", output_size, output_type,
           [2, 2], [src_type, src_type], False, _2src_commutative,
           final(reduce_(src0, src1)))
    opcode(name + "3", output_size, output_type,
           [3, 3], [src_type, src_type], False, _2src_commutative,
           final(reduce_(reduce_(src0, src1), src2)))
    # The 4-component form reduces pairwise: (x, y) and (z, w), then both.
    opcode(name + "4", output_size, output_type,
           [4, 4], [src_type, src_type], False, _2src_commutative,
           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
523 binop("fadd", tfloat
, _2src_commutative
+ associative
,"""
524 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
526 dst = _mesa_double_add_rtz(src0, src1);
528 dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
533 binop("iadd", tint
, _2src_commutative
+ associative
, "src0 + src1")
534 binop("iadd_sat", tint
, _2src_commutative
, """
536 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
537 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
539 binop("uadd_sat", tuint
, _2src_commutative
,
540 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
541 binop("isub_sat", tint
, "", """
543 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
544 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
546 binop("usub_sat", tuint
, "", "src0 < src1 ? 0 : src0 - src1")
548 binop("fsub", tfloat
, "", """
549 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
551 dst = _mesa_double_sub_rtz(src0, src1);
553 dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
558 binop("isub", tint
, "", "src0 - src1")
560 binop("fmul", tfloat
, _2src_commutative
+ associative
, """
561 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
563 dst = _mesa_double_mul_rtz(src0, src1);
565 dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
570 # low 32-bits of signed/unsigned integer multiply
571 binop("imul", tint
, _2src_commutative
+ associative
, "src0 * src1")
573 # Generate 64 bit result from 2 32 bits quantity
574 binop_convert("imul_2x32_64", tint64
, tint32
, _2src_commutative
,
575 "(int64_t)src0 * (int64_t)src1")
576 binop_convert("umul_2x32_64", tuint64
, tuint32
, _2src_commutative
,
577 "(uint64_t)src0 * (uint64_t)src1")
579 # high 32-bits of signed integer multiply
580 binop("imul_high", tint
, _2src_commutative
, """
581 if (bit_size == 64) {
582 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
583 * extension to work properly. The casts are kind-of annoying but needed
584 * to prevent compiler warnings.
586 uint32_t src0_u32[4] = {
592 uint32_t src1_u32[4] = {
598 uint32_t prod_u32[4];
599 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
600 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
602 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
606 # high 32-bits of unsigned integer multiply
607 binop("umul_high", tuint
, _2src_commutative
, """
608 if (bit_size == 64) {
609 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
610 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
611 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
612 uint32_t prod_u32[4];
613 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
614 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
616 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
620 # low 32-bits of unsigned integer multiply
621 binop("umul_low", tuint32
, _2src_commutative
, """
622 uint64_t mask = (1 << (bit_size / 2)) - 1;
623 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
627 binop("fdiv", tfloat
, "", "src0 / src1")
628 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
629 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
631 # returns a boolean representing the carry resulting from the addition of
632 # the two unsigned arguments.
634 binop_convert("uadd_carry", tuint
, tuint
, _2src_commutative
, "src0 + src1 < src0")
636 # returns a boolean representing the borrow resulting from the subtraction
637 # of the two unsigned arguments.
639 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
641 # hadd: (a + b) >> 1 (without overflow)
642 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
643 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
644 # = 2 * (x & y) + (x & ~y) + (~x & y)
645 # = ((x & y) << 1) + (x ^ y)
647 # Since we know that the bottom bit of (x & y) << 1 is zero,
649 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
650 # = (x & y) + ((x ^ y) >> 1)
651 binop("ihadd", tint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
652 binop("uhadd", tuint
, _2src_commutative
, "(src0 & src1) + ((src0 ^ src1) >> 1)")
654 # rhadd: (a + b + 1) >> 1 (without overflow)
655 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
656 # = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
657 # = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
658 # = ((x | y) << 1) - (x ^ y) + 1
660 # Since we know that the bottom bit of (x | y) << 1 is zero,
662 # (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
663 # = (x | y) - ((x ^ y) >> 1)
# rhadd(a, b) = (a + b + 1) >> 1 = (a | b) - ((a ^ b) >> 1).
# The xor term is subtracted; adding it gives e.g. 3 instead of 1 for (2, 0).
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
667 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
669 # For signed integers, there are several different possible definitions of
670 # "modulus" or "remainder". We follow the conventions used by LLVM and
671 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
672 # operation while the imod opcode implements the more mathematical
673 # "modulus" operation. For details on the difference, see
675 # http://mathforum.org/library/drmath/view/52343.html
677 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
678 binop("imod", tint
, "",
679 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
680 " src0 % src1 : src0 % src1 + src1)")
681 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
682 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
689 # these integer-aware comparisons return a boolean (0 or ~0)
691 binop_compare("flt", tfloat
, "", "src0 < src1")
692 binop_compare("fge", tfloat
, "", "src0 >= src1")
693 binop_compare("feq", tfloat
, _2src_commutative
, "src0 == src1")
694 binop_compare("fne", tfloat
, _2src_commutative
, "src0 != src1")
695 binop_compare("ilt", tint
, "", "src0 < src1")
696 binop_compare("ige", tint
, "", "src0 >= src1")
697 binop_compare("ieq", tint
, _2src_commutative
, "src0 == src1")
698 binop_compare("ine", tint
, _2src_commutative
, "src0 != src1")
699 binop_compare("ult", tuint
, "", "src0 < src1")
700 binop_compare("uge", tuint
, "", "src0 >= src1")
701 binop_compare32("flt32", tfloat
, "", "src0 < src1")
702 binop_compare32("fge32", tfloat
, "", "src0 >= src1")
703 binop_compare32("feq32", tfloat
, _2src_commutative
, "src0 == src1")
704 binop_compare32("fne32", tfloat
, _2src_commutative
, "src0 != src1")
705 binop_compare32("ilt32", tint
, "", "src0 < src1")
706 binop_compare32("ige32", tint
, "", "src0 >= src1")
707 binop_compare32("ieq32", tint
, _2src_commutative
, "src0 == src1")
708 binop_compare32("ine32", tint
, _2src_commutative
, "src0 != src1")
709 binop_compare32("ult32", tuint
, "", "src0 < src1")
710 binop_compare32("uge32", tuint
, "", "src0 >= src1")
712 # integer-aware GLSL-style comparisons that compare floats and ints
714 binop_reduce("ball_fequal", 1, tbool1
, tfloat
, "{src0} == {src1}",
715 "{src0} && {src1}", "{src}")
716 binop_reduce("bany_fnequal", 1, tbool1
, tfloat
, "{src0} != {src1}",
717 "{src0} || {src1}", "{src}")
718 binop_reduce("ball_iequal", 1, tbool1
, tint
, "{src0} == {src1}",
719 "{src0} && {src1}", "{src}")
720 binop_reduce("bany_inequal", 1, tbool1
, tint
, "{src0} != {src1}",
721 "{src0} || {src1}", "{src}")
723 binop_reduce("b32all_fequal", 1, tbool32
, tfloat
, "{src0} == {src1}",
724 "{src0} && {src1}", "{src}")
725 binop_reduce("b32any_fnequal", 1, tbool32
, tfloat
, "{src0} != {src1}",
726 "{src0} || {src1}", "{src}")
727 binop_reduce("b32all_iequal", 1, tbool32
, tint
, "{src0} == {src1}",
728 "{src0} && {src1}", "{src}")
729 binop_reduce("b32any_inequal", 1, tbool32
, tint
, "{src0} != {src1}",
730 "{src0} || {src1}", "{src}")
732 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
734 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
735 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
736 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
737 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
739 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
740 # and false respectively
742 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
743 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
744 binop("seq", tfloat32
, _2src_commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
745 binop("sne", tfloat32
, _2src_commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
747 # SPIRV shifts are undefined for shift-operands >= bitsize,
748 # but SM5 shifts are defined to use the least significant bits, only
749 # The NIR definition is according to the SM5 specification.
750 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
751 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
752 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], False, "",
753 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
754 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "",
755 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
757 opcode("urol", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
758 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
759 dst = (src0 << (src1 & rotate_mask)) |
760 (src0 >> (-src1 & rotate_mask));
762 opcode("uror", 0, tuint
, [0, 0], [tuint
, tuint32
], False, "", """
763 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
764 dst = (src0 >> (src1 & rotate_mask)) |
765 (src0 << (-src1 & rotate_mask));
768 # bitwise logic operators
770 # These are also used as boolean and, or, xor for hardware supporting
774 binop("iand", tuint
, _2src_commutative
+ associative
, "src0 & src1")
775 binop("ior", tuint
, _2src_commutative
+ associative
, "src0 | src1")
776 binop("ixor", tuint
, _2src_commutative
+ associative
, "src0 ^ src1")
779 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
782 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
783 "{src0} * {src1}", "{src0} + {src1}", "{src}")
785 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
786 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
787 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], False, "",
788 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
790 binop("fmin", tfloat
, "", "fminf(src0, src1)")
791 binop("imin", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
792 binop("umin", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src0 : src1")
793 binop("fmax", tfloat
, "", "fmaxf(src0, src1)")
794 binop("imax", tint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
795 binop("umax", tuint
, _2src_commutative
+ associative
, "src1 > src0 ? src1 : src0")
797 # Saturated vector add for 4 8bit ints.
798 binop("usadd_4x8", tint32
, _2src_commutative
+ associative
, """
800 for (int i = 0; i < 32; i += 8) {
801 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
805 # Saturated vector subtract for 4 8bit ints.
806 binop("ussub_4x8", tint32
, "", """
808 for (int i = 0; i < 32; i += 8) {
809 int src0_chan = (src0 >> i) & 0xff;
810 int src1_chan = (src1 >> i) & 0xff;
811 if (src0_chan > src1_chan)
812 dst |= (src0_chan - src1_chan) << i;
816 # vector min for 4 8bit ints.
817 binop("umin_4x8", tint32
, _2src_commutative
+ associative
, """
819 for (int i = 0; i < 32; i += 8) {
820 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
824 # vector max for 4 8bit ints.
825 binop("umax_4x8", tint32
, _2src_commutative
+ associative
, """
827 for (int i = 0; i < 32; i += 8) {
828 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
832 # unorm multiply: (a * b) / 255.
833 binop("umul_unorm_4x8", tint32
, _2src_commutative
+ associative
, """
835 for (int i = 0; i < 32; i += 8) {
836 int src0_chan = (src0 >> i) & 0xff;
837 int src1_chan = (src1 >> i) & 0xff;
838 dst |= ((src0_chan * src1_chan) / 255) << i;
# Use double-precision pow() only for 64-bit operands and powf() otherwise,
# the same selection the other bit_size-dependent opcodes (e.g. fsqrt) make.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
844 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
845 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
847 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
848 "src0 | ((uint64_t)src1 << 32)")
850 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
851 "src0 | ((uint32_t)src1 << 16)")
853 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
854 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
855 # are from the low five bits of src0 and src1, respectively.
856 binop_convert("bfm", tuint32
, tint32
, "", """
857 int bits = src0 & 0x1F;
858 int offset = src1 & 0x1F;
859 dst = ((1u << bits) - 1) << offset;
862 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], False, "", """
863 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
864 /* flush denormals to zero. */
866 dst = copysignf(0.0f, src0);
869 # Combines the first component of each input to make a 2-component vector.
871 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
877 binop("extract_u8", tuint
, "", "(uint8_t)(src0 >> (src1 * 8))")
878 binop("extract_i8", tint
, "", "(int8_t)(src0 >> (src1 * 8))")
881 binop("extract_u16", tuint
, "", "(uint16_t)(src0 >> (src1 * 16))")
882 binop("extract_i16", tint
, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose sources and output all use `ty`.

    Output size and all source sizes are 0 and is_conversion is False.
    """
    src_sizes = [0, 0, 0]
    src_types = [ty, ty, ty]
    opcode(name, 0, ty, src_sizes, src_types, False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source tuint opcode with explicit operand sizes.

    The output and all three sources use tuint; is_conversion is False and
    no algebraic properties are set.
    """
    src_sizes = [src1_size, src2_size, src3_size]
    src_types = [tuint, tuint, tuint]
    opcode(name, output_size, tuint, src_sizes, src_types,
           False, "", const_expr)
# ffma: three-source float opcode registered with the _2src_commutative
# property.  The constant expression tests
# nir_is_rounding_mode_rtz(execution_mode, bit_size) and, for that mode, uses
# the _mesa_*_fma_rtz helpers (double, float, or double-then-narrowed with
# _mesa_double_to_float_rtz); the C library fmaf()/fma() are used as well,
# presumably on the non-RTZ path selected by bit size -- confirm.
892 triop("ffma", tfloat
, _2src_commutative
, """
893 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
895 dst = _mesa_double_fma_rtz(src0, src1, src2);
896 else if (bit_size == 32)
897 dst = _mesa_float_fma_rtz(src0, src1, src2);
899 dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
902 dst = fmaf(src0, src1, src2);
904 dst = fma(src0, src1, src2);
# flrp: linear interpolation between src0 and src1, weighted by src2.
triop("flrp", tfloat, "",
      "src0 * (1 - src2) + src1 * src2")

# Conditional select, the per-component vector counterpart of the ?:
# operator.  It comes in two flavours: one driven by floating-point booleans
# (0.0 vs 1.0) and one driven by integer booleans (0 vs ~0).
triop("fcsel", tfloat32, "",
      "(src0 != 0.0f) ? src1 : src2")

# Minimum of three sources.
triop("fmin3", tfloat, "",
      "fminf(src0, fminf(src1, src2))")
triop("imin3", tint, "",
      "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "",
      "MIN2(src0, MIN2(src1, src2))")

# Maximum of three sources.
triop("fmax3", tfloat, "",
      "fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, "",
      "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "",
      "MAX2(src0, MAX2(src1, src2))")

# Median of three sources: clamp src2 between min(src0, src1) and
# max(src0, src1).
triop("fmed3", tfloat, "",
      "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint, "",
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "",
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
# Boolean-driven selects: src0 is the condition (tbool1 for bcsel, tbool32
# for b32csel); src1 is returned when it is true and src2 otherwise.  The
# two value sources and the result are tuint.
opcode("bcsel", 0, tuint,
       [0, 0, 0], [tbool1, tuint, tuint],
       False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint,
       [0, 0, 0], [tbool32, tuint, tuint],
       False, "", "src0 ? src1 : src2")
# bfi: 32-bit unsigned three-source op.  Inside the constant expression the
# sources are named mask (src0), insert (src1) and base (src2); the final
# statement keeps the bits of base where mask is clear and takes the bits of
# insert where mask is set.
938 triop("bfi", tuint32
, "", """
939 unsigned mask = src0, insert = src1, base = src2;
948 dst = (base & ~mask) | (insert & mask);
# bitfield_select: bitwise multiplexer.  Each result bit is taken from src1
# where the corresponding bit of src0 is set and from src2 where it is clear.
triop("bitfield_select", tuint, "",
      "(src0 & src1) | (~src0 & src2)")
# ubfe: unsigned bitfield extract on 32-bit values (base = src0,
# offset = src1, bits = src2).  offset and bits are masked with 0x1F.  When
# offset + bits < 32 the field is shifted to the top of the word and then
# back down by (32 - bits); the remaining case returns base >> offset.
955 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
956 opcode("ubfe", 0, tuint32
,
957 [0, 0, 0], [tuint32
, tuint32
, tuint32
], False, "", """
958 unsigned base = src0;
959 unsigned offset = src1 & 0x1F;
960 unsigned bits = src2 & 0x1F;
963 } else if (offset + bits < 32) {
964 dst = (base << (32 - bits - offset)) >> (32 - bits);
966 dst = base >> offset;
# ibfe: bitfield extract with a tint32 base and result; offset (src1) and
# bits (src2) are tuint32 and are masked with 0x1F.  The shift sequence
# matches ubfe: up to the top of the word and back down by (32 - bits) when
# offset + bits < 32, otherwise base >> offset.
# NOTE(review): presumably `base` is a signed int so the right shifts
# sign-extend the extracted field -- confirm against its declaration.
969 opcode("ibfe", 0, tint32
,
970 [0, 0, 0], [tint32
, tuint32
, tuint32
], False, "", """
972 unsigned offset = src1 & 0x1F;
973 unsigned bits = src2 & 0x1F;
976 } else if (offset + bits < 32) {
977 dst = (base << (32 - bits - offset)) >> (32 - bits);
979 dst = base >> offset;
# ubitfield_extract: unsigned 32-bit base (src0) with signed int32 offset
# (src1) and bits (src2).  A negative offset or bits, or offset + bits > 32,
# yields 0 (flagged in the C code as undefined per the spec).  Otherwise the
# result is base >> offset masked to its low `bits` bits; the mask is built
# from a 64-bit one (1ull), presumably so that bits == 32 does not overflow
# the shift.
983 # GLSL bitfieldExtract()
984 opcode("ubitfield_extract", 0, tuint32
,
985 [0, 0, 0], [tuint32
, tint32
, tint32
], False, "", """
986 unsigned base = src0;
987 int offset = src1, bits = src2;
990 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
991 dst = 0; /* undefined per the spec */
993 dst = (base >> offset) & ((1ull << bits) - 1);
# ibitfield_extract: signed variant of the bitfield extract; all three
# sources and the result are tint32.  offset = src1, bits = src2.  A negative
# offset or bits, or offset + bits > 32, is handled by a dedicated branch.
# The general case moves the field to the top of the word with a left shift
# of (32 - offset - bits) and then shifts right to sign-extend it.
# NOTE(review): the right shift amount is `offset`, but bringing a
# `bits`-wide field from the top of the word down to bit 0 needs a shift of
# (32 - bits).  Assuming `base` is a signed 32-bit int, base = 0xF0,
# offset = 4, bits = 4 evaluates to 0xFF000000 rather than -1.  Confirm
# against the GLSL bitfieldExtract() definition before relying on this.
996 opcode("ibitfield_extract", 0, tint32
,
997 [0, 0, 0], [tint32
, tint32
, tint32
], False, "", """
999 int offset = src1, bits = src2;
1002 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
1005 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
# vec3 is registered through triop_horiz with an output size of 3 and three
# sources of size 1 (triop_horiz types the output and all sources as tuint).
1009 # Combines the first component of each input to make a 3-component vector.
1011 triop_horiz("vec3", 3, 1, 1, 1, """
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source tuint opcode with explicit component sizes.

    The output has `output_size` components and the four sources have
    `src1_size` through `src4_size` components respectively.  The output and
    every source are typed tuint; opcode() is called with False and an empty
    string as its sixth and seventh arguments.
    """
    source_sizes = [src1_size, src2_size, src3_size, src4_size]
    source_types = [tuint] * len(source_sizes)
    opcode(name, output_size, tuint, source_sizes, source_types, False, "",
           const_expr)
# bitfield_insert: four-source op producing a 32-bit unsigned result.
# base (src0) and insert (src1) are tuint32; offset (src2) and bits (src3)
# are tint32.  A negative offset or bits, or bits + offset > 32, is handled
# by a dedicated branch.  In the general case a mask of `bits` one-bits
# positioned at `offset` is built (from a 64-bit one, presumably so that
# bits == 32 does not overflow the shift), and the result keeps base outside
# the mask and takes insert << offset inside it.
1024 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
1025 [tuint32
, tuint32
, tint32
, tint32
], False, "", """
1026 unsigned base = src0, insert = src1;
1027 int offset = src2, bits = src3;
1030 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
1033 unsigned mask = ((1ull << bits) - 1) << offset;
1034 dst = (base & ~mask) | ((insert << offset) & mask);
# vec4 is registered through quadop_horiz with an output size of 4 and four
# sources of size 1 (quadop_horiz types the output and all sources as tuint).
1038 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
1045 # ir3-specific instruction that maps directly to mul-add shift high mix,
1046 # (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
1047 # multiplication (imul) on the Freedreno backend.
1048 opcode("imadsh_mix16", 1, tint32
,
1049 [1, 1, 1], [tint32
, tint32
, tint32
], False, "", """
1050 dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;