2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
42 - algebraic_properties is a space-separated string, where nir_op_is_ is
43 prepended before each entry
44 - const_expr is an expression or series of statements that computes the
45 constant value of the opcode given the constant values of its inputs.
47 Constant expressions are formed from the variables src0, src1, ...,
48 src(N-1), where N is the number of arguments. The output of the
49 expression should be stored in the dst variable. Per-component input
50 and output variables will be scalars and non-per-component input and
51 output variables will be a struct with fields named x, y, z, and w
52 all of the correct type. Input and output variables can be assumed
53 to already be of the correct type and need no conversion. In
54 particular, the conversion from the C bool type to/from NIR_TRUE and
55 NIR_FALSE happens automatically.
57 For per-component instructions, the entire expression will be
58 executed once for each component. For non-per-component
59 instructions, the expression is expected to store the correct values
60 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
61 constant expression, an assignment to dst will happen automatically
62 and the result will be equivalent to "dst = <expression>" for
63 per-component instructions and "dst.x = dst.y = ... = <expression>"
64 for non-per-component instructions.
66 assert isinstance(name
, str)
67 assert isinstance(output_size
, int)
68 assert isinstance(output_type
, str)
69 assert isinstance(input_sizes
, list)
70 assert isinstance(input_sizes
[0], int)
71 assert isinstance(input_types
, list)
72 assert isinstance(input_types
[0], str)
73 assert isinstance(algebraic_properties
, str)
74 assert isinstance(const_expr
, str)
75 assert len(input_sizes
) == len(input_types
)
76 assert 0 <= output_size
<= 4
77 for size
in input_sizes
:
82 self
.num_inputs
= len(input_sizes
)
83 self
.output_size
= output_size
84 self
.output_type
= output_type
85 self
.input_sizes
= input_sizes
86 self
.input_types
= input_types
87 self
.algebraic_properties
= algebraic_properties
88 self
.const_expr
= const_expr
90 # helper variables for strings
# Splits a NIR type string into a base type ("type" group) and an optional
# decimal bit-size suffix ("bits" group, None when the suffix is absent).
_TYPE_SPLIT_RE = re.compile(
    r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?'
)
def type_has_size(type_):
    """Report whether a NIR type string carries an explicit bit size.

    The string is matched against ``_TYPE_SPLIT_RE``.  An AssertionError is
    raised when the string does not begin with a recognised base type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
def type_size(type_):
    """Return the bit size encoded in a NIR type string as an int.

    Asserts both that the string is a recognised NIR type and that it
    actually has a bit-size suffix; unsized types are rejected.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
118 def type_sizes(type_
):
119 if type_has_size(type_
):
120 return [type_size(type_
)]
121 elif type_
== 'bool':
123 elif type_
== 'float':
126 return [8, 16, 32, 64]
def type_base_type(type_):
    """Return the base type of a NIR type string with any bit size removed.

    The result is the "type" group of ``_TYPE_SPLIT_RE``.  An AssertionError
    is raised when the string does not begin with a recognised base type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
133 commutative
= "commutative "
134 associative
= "associative "
136 # global dictionary of opcodes
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode and record it in the global ``opcodes`` dictionary.

    Every argument is forwarded unchanged, in order, to the Opcode
    constructor.  Registering a name that is already present trips an
    assertion.
    """
    already_registered = name in opcodes
    assert not already_registered
    description = Opcode(name, output_size, output_type, input_sizes,
                         input_types, algebraic_properties, const_expr)
    opcodes[name] = description
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-input opcode whose output and input types may differ.

    The opcode is registered with output size 0, a single input of size 0
    and no algebraic properties.
    """
    input_sizes = [0]
    input_types = [in_type]
    no_properties = ""
    opcode(name, 0, out_type, input_sizes, input_types, no_properties,
           const_expr)
def unop(name, ty, const_expr):
    """Register a one-input opcode whose input and output share type ``ty``.

    Output size and input size are both 0 and the opcode has no algebraic
    properties.
    """
    opcode(name=name,
           output_size=0,
           output_type=ty,
           input_sizes=[0],
           input_types=[ty],
           algebraic_properties="",
           const_expr=const_expr)
151 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
153 opcode(name
, output_size
, output_type
, [input_size
], [input_type
], "",
156 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
157 reduce_expr
, final_expr
):
159 return "(" + prereduce_expr
.format(src
=src
) + ")"
161 return final_expr
.format(src
="(" + src
+ ")")
162 def reduce_(src0
, src1
):
163 return reduce_expr
.format(src0
=src0
, src1
=src1
)
164 src0
= prereduce("src0.x")
165 src1
= prereduce("src0.y")
166 src2
= prereduce("src0.z")
167 src3
= prereduce("src0.w")
168 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
169 final(reduce_(src0
, src1
)))
170 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
171 final(reduce_(reduce_(src0
, src1
), src2
)))
172 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
173 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
176 # These two move instructions differ in what modifiers they support and what
177 # the negate modifier means. Otherwise, they are identical.
178 unop("fmov", tfloat
, "src0")
179 unop("imov", tint
, "src0")
181 unop("ineg", tint
, "-src0")
182 unop("fneg", tfloat
, "-src0")
183 unop("inot", tint
, "~src0") # invert every bit of the integer
184 unop("fnot", tfloat
, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
185 "((src0 == 0.0f) ? 1.0f : 0.0f)"))
186 unop("fsign", tfloat
, ("bit_size == 64 ? " +
187 "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
188 "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
189 unop("isign", tint
, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
190 unop("iabs", tint
, "(src0 < 0) ? -src0 : src0")
191 unop("fabs", tfloat
, "fabs(src0)")
192 unop("fsat", tfloat
, ("bit_size == 64 ? " +
193 "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
194 "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
195 unop("frcp", tfloat
, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
196 unop("frsq", tfloat
, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
197 unop("fsqrt", tfloat
, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
198 unop("fexp2", tfloat
, "exp2f(src0)")
199 unop("flog2", tfloat
, "log2f(src0)")
201 # Generate all of the numeric conversion opcodes
202 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
204 dst_types
= [tfloat
, tint
]
206 dst_types
= [tfloat
, tint
, tbool
]
208 dst_types
= [tfloat
, tuint
]
209 elif src_t
== tfloat
:
210 dst_types
= [tint
, tuint
, tfloat
, tbool
]
212 for dst_t
in dst_types
:
213 for bit_size
in type_sizes(dst_t
):
214 if bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
215 rnd_modes
= ['_rtne', '_rtz', '']
216 for rnd_mode
in rnd_modes
:
217 unop_convert("{0}2{1}{2}{3}".format(src_t
[0], dst_t
[0],
219 dst_t
+ str(bit_size
), src_t
, "src0")
221 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
222 unop_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0], bit_size
),
223 dst_t
+ str(bit_size
), src_t
, conv_expr
)
226 # Unary floating-point rounding operations.
229 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
230 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
231 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
232 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
233 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
235 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
237 # Trigonometric operations.
240 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
241 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
244 unop_convert("frexp_exp", tint32
, tfloat64
, "frexp(src0, &dst);")
245 unop_convert("frexp_sig", tfloat64
, tfloat64
, "int n; dst = frexp(src0, &n);")
247 # Partial derivatives.
250 unop("fddx", tfloat
, "0.0") # the derivative of a constant is 0.
251 unop("fddy", tfloat
, "0.0")
252 unop("fddx_fine", tfloat
, "0.0")
253 unop("fddy_fine", tfloat
, "0.0")
254 unop("fddx_coarse", tfloat
, "0.0")
255 unop("fddy_coarse", tfloat
, "0.0")
258 # Floating point pack and unpack operations.
261 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
262 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
263 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
264 """.replace("fmt", fmt
))
267 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
268 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
269 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
270 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
271 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
272 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the ``unpack_<fmt>_2x16`` opcode.

    The opcode takes one tuint32 component and produces two tfloat32
    components.  ``fmt`` is substituted for the literal text "fmt" in the
    constant expression, so the C helper that gets called is
    ``unpack_<fmt>_1x16``.
    """
    # dst.y must come from the upper 16 bits of the source word.  The
    # previous expression shifted left by 16 before truncating to uint16_t,
    # which always yields 0; shifting right selects the high half.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
281 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
282 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
283 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
284 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
285 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
286 """.replace("fmt", fmt
))
300 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
301 dst.x = (src0.x & 0xffff) | (src0.y << 16);
304 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
305 dst.x = (src0.x << 0) |
311 unop_horiz("pack_32_2x16", 1, tuint32
, 2, tuint16
,
312 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
314 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
315 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
317 unop_horiz("pack_64_4x16", 1, tuint64
, 4, tuint16
,
318 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
320 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
321 "dst.x = src0.x; dst.y = src0.x >> 32;")
# Split one 64-bit component into four 16-bit components, lowest word first.
# The source has a single component, so every output (including dst.w) must
# be taken from src0.x; reading src0.w was a typo.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
326 unop_horiz("unpack_32_2x16", 2, tuint16
, 1, tuint32
,
327 "dst.x = src0.x; dst.y = src0.x >> 16;")
329 # Lowered floating point unpacking operations.
332 unop_convert("unpack_half_2x16_split_x", tfloat32
, tuint32
,
333 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
334 unop_convert("unpack_half_2x16_split_y", tfloat32
, tuint32
,
335 "unpack_half_1x16((uint16_t)(src0 >> 16))")
337 unop_convert("unpack_32_2x16_split_x", tuint16
, tuint32
, "src0")
338 unop_convert("unpack_32_2x16_split_y", tuint16
, tuint32
, "src0 >> 16")
340 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
341 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
343 # Bit operations, part of ARB_gpu_shader5.
346 unop("bitfield_reverse", tuint32
, """
347 /* we're not winning any awards for speed here, but that's ok */
349 for (unsigned bit = 0; bit < 32; bit++)
350 dst |= ((src0 >> bit) & 1) << (31 - bit);
352 unop_convert("bit_count", tuint32
, tuint
, """
354 for (unsigned bit = 0; bit < bit_size; bit++) {
355 if ((src0 >> bit) & 1)
360 unop_convert("ufind_msb", tint32
, tuint
, """
362 for (int bit = bit_size - 1; bit >= 0; bit--) {
363 if ((src0 >> bit) & 1) {
370 unop("ifind_msb", tint32
, """
372 for (int bit = 31; bit >= 0; bit--) {
373 /* If src0 < 0, we're looking for the first 0 bit.
374 * if src0 >= 0, we're looking for the first 1 bit.
376 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
377 (!((src0 >> bit) & 1) && (src0 < 0))) {
384 unop_convert("find_lsb", tint32
, tint
, """
386 for (unsigned bit = 0; bit < bit_size; bit++) {
387 if ((src0 >> bit) & 1) {
395 for i
in range(1, 5):
396 for j
in range(1, 5):
397 unop_horiz("fnoise{0}_{1}".format(i
, j
), i
, tfloat
, j
, tfloat
, "0.0f")
400 # AMD_gcn_shader extended instructions
401 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
403 float absX = fabs(src0.x);
404 float absY = fabs(src0.y);
405 float absZ = fabs(src0.z);
406 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
407 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
408 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
409 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
410 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
411 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
414 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
415 float absX = fabs(src0.x);
416 float absY = fabs(src0.y);
417 float absZ = fabs(src0.z);
418 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
419 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
420 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
421 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
422 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
423 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-input opcode whose output type may differ from its inputs.

    Both inputs share ``in_type``; output size and both input sizes are 0.
    ``alg_props`` is passed through as the algebraic-properties string.
    """
    input_sizes = [0] * 2
    input_types = [in_type] * 2
    opcode(name, 0, out_type, input_sizes, input_types, alg_props,
           const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-input opcode where inputs and output all have type ``ty``."""
    binop_convert(name,
                  out_type=ty,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-input opcode with inputs of type ``ty`` and a tbool32 result."""
    binop_convert(name,
                  out_type=tbool32,
                  in_type=ty,
                  alg_props=alg_props,
                  const_expr=const_expr)
436 def binop_horiz(name
, out_size
, out_type
, src1_size
, src1_type
, src2_size
,
437 src2_type
, const_expr
):
438 opcode(name
, out_size
, out_type
, [src1_size
, src2_size
], [src1_type
, src2_type
],
441 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
442 reduce_expr
, final_expr
):
444 return final_expr
.format(src
= "(" + src
+ ")")
445 def reduce_(src0
, src1
):
446 return reduce_expr
.format(src0
=src0
, src1
=src1
)
447 def prereduce(src0
, src1
):
448 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
449 src0
= prereduce("src0.x", "src1.x")
450 src1
= prereduce("src0.y", "src1.y")
451 src2
= prereduce("src0.z", "src1.z")
452 src3
= prereduce("src0.w", "src1.w")
453 opcode(name
+ "2", output_size
, output_type
,
454 [2, 2], [src_type
, src_type
], commutative
,
455 final(reduce_(src0
, src1
)))
456 opcode(name
+ "3", output_size
, output_type
,
457 [3, 3], [src_type
, src_type
], commutative
,
458 final(reduce_(reduce_(src0
, src1
), src2
)))
459 opcode(name
+ "4", output_size
, output_type
,
460 [4, 4], [src_type
, src_type
], commutative
,
461 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
463 binop("fadd", tfloat
, commutative
+ associative
, "src0 + src1")
464 binop("iadd", tint
, commutative
+ associative
, "src0 + src1")
465 binop("fsub", tfloat
, "", "src0 - src1")
466 binop("isub", tint
, "", "src0 - src1")
468 binop("fmul", tfloat
, commutative
+ associative
, "src0 * src1")
469 # low 32-bits of signed/unsigned integer multiply
470 binop("imul", tint
, commutative
+ associative
, "src0 * src1")
472 # high 32-bits of signed integer multiply
473 binop("imul_high", tint
, commutative
, """
474 if (bit_size == 64) {
475 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
476 * extension to work properly. The casts are kind-of annoying but needed
477 * to prevent compiler warnings.
479 uint32_t src0_u32[4] = {
485 uint32_t src1_u32[4] = {
491 uint32_t prod_u32[4];
492 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
493 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
495 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
499 # high 32-bits of unsigned integer multiply
500 binop("umul_high", tuint
, commutative
, """
501 if (bit_size == 64) {
502 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
503 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
504 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
505 uint32_t prod_u32[4];
506 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
507 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
509 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
513 binop("fdiv", tfloat
, "", "src0 / src1")
514 binop("idiv", tint
, "", "src1 == 0 ? 0 : (src0 / src1)")
515 binop("udiv", tuint
, "", "src1 == 0 ? 0 : (src0 / src1)")
517 # returns a boolean representing the carry resulting from the addition of
518 # the two unsigned arguments.
520 binop_convert("uadd_carry", tuint
, tuint
, commutative
, "src0 + src1 < src0")
522 # returns a boolean representing the borrow resulting from the subtraction
523 # of the two unsigned arguments.
525 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
527 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
529 # For signed integers, there are several different possible definitions of
530 # "modulus" or "remainder". We follow the conventions used by LLVM and
531 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
532 # operation while the imod opcode implements the more mathematical
533 # "modulus" operation. For details on the difference, see
535 # http://mathforum.org/library/drmath/view/52343.html
537 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
538 binop("imod", tint
, "",
539 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
540 " src0 % src1 : src0 % src1 + src1)")
541 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
542 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
549 # these integer-aware comparisons return a boolean (0 or ~0)
551 binop_compare("flt", tfloat
, "", "src0 < src1")
552 binop_compare("fge", tfloat
, "", "src0 >= src1")
553 binop_compare("feq", tfloat
, commutative
, "src0 == src1")
554 binop_compare("fne", tfloat
, commutative
, "src0 != src1")
555 binop_compare("ilt", tint
, "", "src0 < src1")
556 binop_compare("ige", tint
, "", "src0 >= src1")
557 binop_compare("ieq", tint
, commutative
, "src0 == src1")
558 binop_compare("ine", tint
, commutative
, "src0 != src1")
559 binop_compare("ult", tuint
, "", "src0 < src1")
560 binop_compare("uge", tuint
, "", "src0 >= src1")
562 # integer-aware GLSL-style comparisons that compare floats and ints
564 binop_reduce("ball_fequal", 1, tbool32
, tfloat
, "{src0} == {src1}",
565 "{src0} && {src1}", "{src}")
566 binop_reduce("bany_fnequal", 1, tbool32
, tfloat
, "{src0} != {src1}",
567 "{src0} || {src1}", "{src}")
568 binop_reduce("ball_iequal", 1, tbool32
, tint
, "{src0} == {src1}",
569 "{src0} && {src1}", "{src}")
570 binop_reduce("bany_inequal", 1, tbool32
, tint
, "{src0} != {src1}",
571 "{src0} || {src1}", "{src}")
573 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
575 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
576 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
577 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
578 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
580 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
581 # and false respectively
583 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
584 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
585 binop("seq", tfloat32
, commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
586 binop("sne", tfloat32
, commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
589 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], "", "src0 << src1")
590 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], "", "src0 >> src1")
591 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], "", "src0 >> src1")
593 # bitwise logic operators
595 # These are also used as boolean and, or, xor for hardware supporting
599 binop("iand", tuint
, commutative
+ associative
, "src0 & src1")
600 binop("ior", tuint
, commutative
+ associative
, "src0 | src1")
601 binop("ixor", tuint
, commutative
+ associative
, "src0 ^ src1")
604 # floating point logic operators
606 # These use (src != 0.0) for testing the truth of the input, and output 1.0
607 # for true and 0.0 for false
609 binop("fand", tfloat32
, commutative
,
610 "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
611 binop("for", tfloat32
, commutative
,
612 "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
613 binop("fxor", tfloat32
, commutative
,
614 "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
616 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
619 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
620 "{src0} * {src1}", "{src0} + {src1}", "{src}")
622 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], "",
623 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
624 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], "",
625 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
627 binop("fmin", tfloat
, "", "fminf(src0, src1)")
628 binop("imin", tint
, commutative
+ associative
, "src1 > src0 ? src0 : src1")
629 binop("umin", tuint
, commutative
+ associative
, "src1 > src0 ? src0 : src1")
630 binop("fmax", tfloat
, "", "fmaxf(src0, src1)")
631 binop("imax", tint
, commutative
+ associative
, "src1 > src0 ? src1 : src0")
632 binop("umax", tuint
, commutative
+ associative
, "src1 > src0 ? src1 : src0")
634 # Saturated vector add for 4 8bit ints.
635 binop("usadd_4x8", tint32
, commutative
+ associative
, """
637 for (int i = 0; i < 32; i += 8) {
638 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
642 # Saturated vector subtract for 4 8bit ints.
643 binop("ussub_4x8", tint32
, "", """
645 for (int i = 0; i < 32; i += 8) {
646 int src0_chan = (src0 >> i) & 0xff;
647 int src1_chan = (src1 >> i) & 0xff;
648 if (src0_chan > src1_chan)
649 dst |= (src0_chan - src1_chan) << i;
653 # vector min for 4 8bit ints.
654 binop("umin_4x8", tint32
, commutative
+ associative
, """
656 for (int i = 0; i < 32; i += 8) {
657 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
661 # vector max for 4 8bit ints.
662 binop("umax_4x8", tint32
, commutative
+ associative
, """
664 for (int i = 0; i < 32; i += 8) {
665 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
669 # unorm multiply: (a * b) / 255.
670 binop("umul_unorm_4x8", tint32
, commutative
+ associative
, """
672 for (int i = 0; i < 32; i += 8) {
673 int src0_chan = (src0 >> i) & 0xff;
674 int src1_chan = (src1 >> i) & 0xff;
675 dst |= ((src0_chan * src1_chan) / 255) << i;
# Use the double-precision pow() for 64-bit operands and the single-precision
# powf() otherwise, matching the other bit_size-dependent float opcodes in
# this file (fsqrt, fsin, ...).  The two branches were previously swapped.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
681 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
682 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
684 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
685 "src0 | ((uint64_t)src1 << 32)")
687 binop_convert("pack_32_2x16_split", tuint32
, tuint16
, "",
688 "src0 | ((uint32_t)src1 << 16)")
690 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
691 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
692 # if either of its arguments are 32.
693 binop_convert("bfm", tuint32
, tint32
, "", """
694 int bits = src0, offset = src1;
695 if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
696 dst = 0; /* undefined */
698 dst = ((1u << bits) - 1) << offset;
701 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], "", """
702 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
703 /* flush denormals to zero. */
705 dst = copysignf(0.0f, src0);
708 # Combines the first component of each input to make a 2-component vector.
710 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
716 binop("extract_u8", tuint
, "", "(uint8_t)(src0 >> (src1 * 8))")
717 binop("extract_i8", tint
, "", "(int8_t)(src0 >> (src1 * 8))")
720 binop("extract_u16", tuint
, "", "(uint16_t)(src0 >> (src1 * 16))")
721 binop("extract_i16", tint
, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, const_expr):
    """Register a three-input opcode where inputs and output all have type ``ty``.

    Output size and all three input sizes are 0, and the opcode has no
    algebraic properties.
    """
    input_sizes = [0] * 3
    input_types = [ty] * 3
    opcode(name, 0, ty, input_sizes, input_types, "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-input tuint opcode with explicit operand sizes.

    The output and all three inputs use type ``tuint``; sizes are taken
    from the arguments and no algebraic properties are set.
    """
    input_sizes = [src1_size, src2_size, src3_size]
    input_types = [tuint] * 3
    opcode(name, output_size, tuint, input_sizes, input_types, "",
           const_expr)
731 triop("ffma", tfloat
, "src0 * src1 + src2")
733 triop("flrp", tfloat
, "src0 * (1 - src2) + src1 * src2")
737 # A vector conditional select instruction (like ?:, but operating per-
738 # component on vectors). There are two versions, one for floating point
739 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
742 triop("fcsel", tfloat32
, "(src0 != 0.0f) ? src1 : src2")
745 triop("fmin3", tfloat
, "fminf(src0, fminf(src1, src2))")
746 triop("imin3", tint
, "MIN2(src0, MIN2(src1, src2))")
747 triop("umin3", tuint
, "MIN2(src0, MIN2(src1, src2))")
749 triop("fmax3", tfloat
, "fmaxf(src0, fmaxf(src1, src2))")
750 triop("imax3", tint
, "MAX2(src0, MAX2(src1, src2))")
751 triop("umax3", tuint
, "MAX2(src0, MAX2(src1, src2))")
753 triop("fmed3", tfloat
, "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
754 triop("imed3", tint
, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
755 triop("umed3", tuint
, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
757 opcode("bcsel", 0, tuint
, [0, 0, 0],
758 [tbool32
, tuint
, tuint
], "", "src0 ? src1 : src2")
761 triop("bfi", tuint32
, """
762 unsigned mask = src0, insert = src1, base = src2;
771 dst = (base & ~mask) | (insert & mask);
775 # SM5 ubfe/ibfe assembly
776 opcode("ubfe", 0, tuint32
,
777 [0, 0, 0], [tuint32
, tint32
, tint32
], "", """
778 unsigned base = src0;
779 int offset = src1, bits = src2;
782 } else if (bits < 0 || offset < 0) {
783 dst = 0; /* undefined */
784 } else if (offset + bits < 32) {
785 dst = (base << (32 - bits - offset)) >> (32 - bits);
787 dst = base >> offset;
790 opcode("ibfe", 0, tint32
,
791 [0, 0, 0], [tint32
, tint32
, tint32
], "", """
793 int offset = src1, bits = src2;
796 } else if (bits < 0 || offset < 0) {
797 dst = 0; /* undefined */
798 } else if (offset + bits < 32) {
799 dst = (base << (32 - bits - offset)) >> (32 - bits);
801 dst = base >> offset;
805 # GLSL bitfieldExtract()
806 opcode("ubitfield_extract", 0, tuint32
,
807 [0, 0, 0], [tuint32
, tint32
, tint32
], "", """
808 unsigned base = src0;
809 int offset = src1, bits = src2;
812 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
813 dst = 0; /* undefined per the spec */
815 dst = (base >> offset) & ((1ull << bits) - 1);
818 opcode("ibitfield_extract", 0, tint32
,
819 [0, 0, 0], [tint32
, tint32
, tint32
], "", """
821 int offset = src1, bits = src2;
824 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
827 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
831 # Combines the first component of each input to make a 3-component vector.
833 triop_horiz("vec3", 3, 1, 1, 1, """
839 def quadop_horiz(name
, output_size
, src1_size
, src2_size
, src3_size
,
840 src4_size
, const_expr
):
841 opcode(name
, output_size
, tuint
,
842 [src1_size
, src2_size
, src3_size
, src4_size
],
843 [tuint
, tuint
, tuint
, tuint
],
846 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
847 [tuint32
, tuint32
, tint32
, tint32
], "", """
848 unsigned base = src0, insert = src1;
849 int offset = src2, bits = src3;
852 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
855 unsigned mask = ((1ull << bits) - 1) << offset;
856 dst = (base & ~mask) | ((insert << offset) & mask);
860 quadop_horiz("vec4", 4, 1, 1, 1, 1, """