2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
27 # Class that represents all the information we have about the opcode
28 # NOTE: this must be kept in sync with nir_op_info
31 """Class that represents all the information we have about the opcode
32 NOTE: this must be kept in sync with nir_op_info
34 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
35 input_types
, algebraic_properties
, const_expr
):
38 - name is the name of the opcode (prepend nir_op_ for the enum name)
39 - all types are strings that get nir_type_ prepended to them
40 - input_types is a list of types
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
43 - const_expr is an expression or series of statements that computes the
44 constant value of the opcode given the constant values of its inputs.
46 Constant expressions are formed from the variables src0, src1, ...,
47 src(N-1), where N is the number of arguments. The output of the
48 expression should be stored in the dst variable. Per-component input
49 and output variables will be scalars and non-per-component input and
50 output variables will be a struct with fields named x, y, z, and w
51 all of the correct type. Input and output variables can be assumed
52 to already be of the correct type and need no conversion. In
53 particular, the conversion from the C bool type to/from NIR_TRUE and
54 NIR_FALSE happens automatically.
56 For per-component instructions, the entire expression will be
57 executed once for each component. For non-per-component
58 instructions, the expression is expected to store the correct values
59 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
60 constant expression, an assignment to dst will happen automatically
61 and the result will be equivalent to "dst = <expression>" for
62 per-component instructions and "dst.x = dst.y = ... = <expression>"
63 for non-per-component instructions.
65 assert isinstance(name
, str)
66 assert isinstance(output_size
, int)
67 assert isinstance(output_type
, str)
68 assert isinstance(input_sizes
, list)
69 assert isinstance(input_sizes
[0], int)
70 assert isinstance(input_types
, list)
71 assert isinstance(input_types
[0], str)
72 assert isinstance(algebraic_properties
, str)
73 assert isinstance(const_expr
, str)
74 assert len(input_sizes
) == len(input_types
)
75 assert 0 <= output_size
<= 4
76 for size
in input_sizes
:
81 self
.num_inputs
= len(input_sizes
)
82 self
.output_size
= output_size
83 self
.output_type
= output_type
84 self
.input_sizes
= input_sizes
85 self
.input_types
= input_types
86 self
.algebraic_properties
= algebraic_properties
87 self
.const_expr
= const_expr
89 # helper variables for strings
# Algebraic-property tags.  Each ends in a space so that several tags can be
# joined with "+" and still form a space-separated property string.
commutative = "commutative "
associative = "associative "
105 # global dictionary of opcodes
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode from the arguments and record it under ``name``.

    All arguments are forwarded, unchanged and in order, to the Opcode
    constructor.  The resulting object is stored in the global ``opcodes``
    dictionary keyed by ``name``; registering a name that is already present
    trips the assertion.
    """
    assert name not in opcodes
    new_op = Opcode(name, output_size, output_type, input_sizes,
                    input_types, algebraic_properties, const_expr)
    opcodes[name] = new_op
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose result type may differ from its
    source type.

    The output size and the single input size are both passed to opcode()
    as 0, and the algebraic-properties string is empty.
    """
    src_sizes = [0]
    src_types = [in_type]
    opcode(name, 0, out_type, src_sizes, src_types, "", const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type ``ty``.

    The output size and input size are both passed to opcode() as 0, and the
    algebraic-properties string is empty.
    """
    src_sizes = [0]
    src_types = [ty]
    opcode(name, 0, ty, src_sizes, src_types, "", const_expr)
120 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
122 opcode(name
, output_size
, output_type
, [input_size
], [input_type
], "",
125 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
126 reduce_expr
, final_expr
):
128 return "(" + prereduce_expr
.format(src
=src
) + ")"
130 return final_expr
.format(src
="(" + src
+ ")")
131 def reduce_(src0
, src1
):
132 return reduce_expr
.format(src0
=src0
, src1
=src1
)
133 src0
= prereduce("src0.x")
134 src1
= prereduce("src0.y")
135 src2
= prereduce("src0.z")
136 src3
= prereduce("src0.w")
137 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
138 final(reduce_(src0
, src1
)))
139 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
140 final(reduce_(reduce_(src0
, src1
), src2
)))
141 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
142 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
145 # These two move instructions differ in what modifiers they support and what
146 # the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Negation and NOT.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
unop("fnot", tfloat,
     "bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : "
     "((src0 == 0.0f) ? 1.0f : 0.0f)")

# Sign, absolute value and saturate.
unop("fsign", tfloat,
     "bit_size == 64 ? "
     "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
     "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))")
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat,
     "bit_size == 64 ? "
     "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : "
     "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))")

# Reciprocal, square roots, exponential and logarithm.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat,
     "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
170 # Generate all of the numeric conversion opcodes
171 for src_t
in [tint
, tuint
, tfloat
]:
172 if src_t
in (tint
, tuint
):
173 dst_types
= [tfloat
, src_t
]
174 elif src_t
== tfloat
:
175 dst_types
= [tint
, tuint
, tfloat
]
177 for dst_t
in dst_types
:
179 bit_sizes
= [16, 32, 64]
181 bit_sizes
= [8, 16, 32, 64]
182 for bit_size
in bit_sizes
:
183 if bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
184 rnd_modes
= ['_rtne', '_rtz', '']
185 for rnd_mode
in rnd_modes
:
186 unop_convert("{0}2{1}{2}{3}".format(src_t
[0], dst_t
[0],
188 dst_t
+ str(bit_size
), src_t
, "src0")
190 unop_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0], bit_size
),
191 dst_t
+ str(bit_size
), src_t
, "src0")
193 # We'll hand-code the to/from bool conversion opcodes. Because bool doesn't
194 # have multiple bit-sizes, we can always infer the size from the other type.
# To bool: any non-zero source is true.
unop_convert("f2b", tbool, tfloat, "src0 != 0.0")
unop_convert("i2b", tbool, tint, "src0 != 0")
# From bool: true becomes one, false becomes zero.
unop_convert("b2f", tfloat, tbool, "src0 ? 1.0 : 0.0")
unop_convert("b2i", tint, tbool, "src0 ? 1 : 0")
201 # Unary floating-point rounding operations.
# Each rounding op selects the double or float C routine from bit_size.
unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat,
     "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat,
     "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Inputs with magnitude below 2^-14 become a zero carrying the source's
# sign; everything else is round-tripped through the half-float converters.
unop("fquantize2f16", tfloat,
     "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
212 # Trigonometric operations.
unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# frexp split into two opcodes: one yields the exponent (written through the
# pointer argument), the other yields the significand (frexp's return value).
unop_convert("frexp_exp", tint32, tfloat64, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat64, tfloat64,
             "int n; dst = frexp(src0, &n);")
222 # Partial derivatives.
# The derivative of a constant is 0, so every variant folds to "0.0".
unop("fddx", tfloat, "0.0")
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
233 # Floating point pack and unpack operations.
236 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
237 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
238 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
239 """.replace("fmt", fmt
))
242 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
243 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
244 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
245 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
246 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
247 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the "unpack_<fmt>_2x16" opcode.

    The opcode takes one 32-bit unsigned source and produces a 2-component
    32-bit float result.  ``fmt`` is substituted for every occurrence of
    "fmt" in the constant expression, so the per-half conversion is done by
    the C helper ``unpack_<fmt>_1x16``.

    dst.x comes from the low 16 bits of the source and dst.y from the high
    16 bits.  The high half has to be shifted *right* by 16 before the
    uint16_t cast; a left shift would leave the low 16 bits zero and dst.y
    would always be decoded from 0.
    """
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
256 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
257 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
258 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
259 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
260 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
261 """.replace("fmt", fmt
))
275 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
276 dst.x = (src0.x & 0xffff) | (src0.y << 16);
279 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
280 dst.x = (src0.x << 0) |
# Pack narrow unsigned components into one wider scalar; component x lands
# in the lowest bits.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Split one 64-bit scalar into its low (x) and high (y) 32-bit words.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
# Split one 64-bit scalar into four 16-bit components, lowest bits first.
# The source has a single component, so every destination component must be
# taken from src0.x (src0.w is not a valid input here).
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
# Split one 32-bit scalar into its low (x) and high (y) 16-bit halves.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.
unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

# Integer split variants: _x yields the low half, _y the high half.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
318 # Bit operations, part of ARB_gpu_shader5.
321 unop("bitfield_reverse", tuint32
, """
322 /* we're not winning any awards for speed here, but that's ok */
324 for (unsigned bit = 0; bit < 32; bit++)
325 dst |= ((src0 >> bit) & 1) << (31 - bit);
327 unop_convert("bit_count", tuint32
, tuint
, """
329 for (unsigned bit = 0; bit < bit_size; bit++) {
330 if ((src0 >> bit) & 1)
335 unop_convert("ufind_msb", tint32
, tuint
, """
337 for (int bit = bit_size - 1; bit >= 0; bit--) {
338 if ((src0 >> bit) & 1) {
345 unop("ifind_msb", tint32
, """
347 for (int bit = 31; bit >= 0; bit--) {
348 /* If src0 < 0, we're looking for the first 0 bit.
349 * if src0 >= 0, we're looking for the first 1 bit.
351 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
352 (!((src0 >> bit) & 1) && (src0 < 0))) {
359 unop_convert("find_lsb", tint32
, tint
, """
361 for (unsigned bit = 0; bit < bit_size; bit++) {
362 if ((src0 >> bit) & 1) {
# Register fnoise<i>_<j> for every output size i and input size j in 1..4.
# The constant expression of each one is simply "0.0f".
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat,
                   "0.0f")
375 # AMD_gcn_shader extended instructions
376 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
378 float absX = fabs(src0.x);
379 float absY = fabs(src0.y);
380 float absZ = fabs(src0.z);
381 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
382 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
383 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
384 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
385 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
386 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
389 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
390 float absX = fabs(src0.x);
391 float absY = fabs(src0.y);
392 float absZ = fabs(src0.z);
393 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
394 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
395 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
396 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
397 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
398 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share one type.

    Both sources use ``in_type`` and the result uses ``out_type``.  The
    output size and both input sizes are passed to opcode() as 0;
    ``alg_props`` is forwarded as the algebraic-properties string.
    """
    src_sizes = [0, 0]
    src_types = [in_type, in_type]
    opcode(name, 0, out_type, src_sizes, src_types, alg_props, const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type
    ``ty``, by delegating to binop_convert().
    """
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison: both sources have type ``ty`` and
    the result type is ``tbool``.  Delegates to binop_convert().
    """
    binop_convert(name, out_type=tbool, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
411 def binop_horiz(name
, out_size
, out_type
, src1_size
, src1_type
, src2_size
,
412 src2_type
, const_expr
):
413 opcode(name
, out_size
, out_type
, [src1_size
, src2_size
], [src1_type
, src2_type
],
416 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
417 reduce_expr
, final_expr
):
419 return final_expr
.format(src
= "(" + src
+ ")")
420 def reduce_(src0
, src1
):
421 return reduce_expr
.format(src0
=src0
, src1
=src1
)
422 def prereduce(src0
, src1
):
423 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
424 src0
= prereduce("src0.x", "src1.x")
425 src1
= prereduce("src0.y", "src1.y")
426 src2
= prereduce("src0.z", "src1.z")
427 src3
= prereduce("src0.w", "src1.w")
428 opcode(name
+ "2", output_size
, output_type
,
429 [2, 2], [src_type
, src_type
], commutative
,
430 final(reduce_(src0
, src1
)))
431 opcode(name
+ "3", output_size
, output_type
,
432 [3, 3], [src_type
, src_type
], commutative
,
433 final(reduce_(reduce_(src0
, src1
), src2
)))
434 opcode(name
+ "4", output_size
, output_type
,
435 [4, 4], [src_type
, src_type
], commutative
,
436 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
# Addition and subtraction.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

# Multiplication.
binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# Division; the integer forms fold a zero divisor to 0.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
binop_convert("uadd_carry", tuint, tuint, commutative,
              "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# Unsigned modulus; a zero divisor folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html
binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# Float counterparts: fmod rounds the quotient down, frem truncates it.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
# these integer-aware comparisons return a boolean (0 or ~0)

# Float comparisons.
binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
# Signed integer comparisons.
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
# Unsigned integer comparisons.
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
# integer-aware GLSL-style comparisons that compare floats and ints
#
# "ball" ANDs the per-component equality tests together; "bany" ORs the
# per-component inequality tests together.
binop_reduce("ball_fequal", 1, tbool, tfloat,
             "{src0} == {src1}", "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat,
             "{src0} != {src1}", "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal", 1, tbool, tint,
             "{src0} == {src1}", "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint,
             "{src0} != {src1}", "{src0} || {src1}", "{src}")
# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
binop_reduce("fall_equal", 1, tfloat32, tfloat32,
             "{src0} == {src1}", "{src0} && {src1}",
             "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32,
             "{src0} != {src1}", "{src0} || {src1}",
             "{src} ? 1.0f : 0.0f")
# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

# Set on Less Than
binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f")
# Set on Greater or Equal
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f")
# Set on Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f")
# Set on Not Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f")
# Shifts are registered through opcode() directly because the shift count
# (second source) is always tuint32, whatever the type of the shifted value.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")
533 # bitwise logic operators
535 # These are also used as boolean and, or, xor for hardware supporting
# Bitwise AND / OR / XOR, all commutative and associative.
binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")
# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false
binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
556 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
# Dot product registered with an output size of 4.
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: 3-component source dotted with a 4-component source, with the second
# source's w component added on.  Registered with output sizes 1 and 4.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
# Minimum.  Only the integer forms are tagged commutative and associative.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
# Maximum.
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
574 # Saturated vector add for 4 8bit ints.
575 binop("usadd_4x8", tint32
, commutative
+ associative
, """
577 for (int i = 0; i < 32; i += 8) {
578 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
582 # Saturated vector subtract for 4 8bit ints.
583 binop("ussub_4x8", tint32
, "", """
585 for (int i = 0; i < 32; i += 8) {
586 int src0_chan = (src0 >> i) & 0xff;
587 int src1_chan = (src1 >> i) & 0xff;
588 if (src0_chan > src1_chan)
589 dst |= (src0_chan - src1_chan) << i;
593 # vector min for 4 8bit ints.
594 binop("umin_4x8", tint32
, commutative
+ associative
, """
596 for (int i = 0; i < 32; i += 8) {
597 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
601 # vector max for 4 8bit ints.
602 binop("umax_4x8", tint32
, commutative
+ associative
, """
604 for (int i = 0; i < 32; i += 8) {
605 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
609 # unorm multiply: (a * b) / 255.
610 binop("umul_unorm_4x8", tint32
, commutative
+ associative
, """
612 for (int i = 0; i < 32; i += 8) {
613 int src0_chan = (src0 >> i) & 0xff;
614 int src1_chan = (src1 >> i) & 0xff;
615 dst |= ((src0_chan * src1_chan) / 255) << i;
# pow() is the double-precision C routine and powf() the single-precision
# one, so the 64-bit case must select pow() and every other size powf().
binop("fpow", tfloat, "",
      "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
# Two-source pack operations: src0 supplies the low half of the result and
# src1 the high half.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments is 32.
633 binop_convert("bfm", tuint32
, tint32
, "", """
634 int bits = src0, offset = src1;
635 if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
636 dst = 0; /* undefined */
638 dst = ((1u << bits) - 1) << offset;
641 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], "", """
642 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
643 /* flush denormals to zero. */
645 dst = copysignf(0.0f, src0);
648 # Combines the first component of each input to make a 2-component vector.
650 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
# Byte extraction: src1 selects which 8-bit field of src0 to return; the
# cast decides between zero extension (u8) and sign extension (i8).
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: same scheme with 16-bit fields.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, const_expr):
    """Register a three-source opcode whose sources and result all have type
    ``ty``.

    The output size and all three input sizes are passed to opcode() as 0,
    and the algebraic-properties string is empty.
    """
    opcode(name, 0, ty, [0] * 3, [ty] * 3, "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source opcode with explicit output and source sizes.

    The result and all three sources are typed ``tuint``, and the
    algebraic-properties string is empty.
    """
    src_sizes = [src1_size, src2_size, src3_size]
    src_types = [tuint] * 3
    opcode(name, output_size, tuint, src_sizes, src_types, "", const_expr)
# Fused multiply-add and linear interpolation.
triop("ffma", tfloat, "src0 * src1 + src2")

triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")

# Three-way minimum.
triop("fmin3", tfloat, "fminf(src0, fminf(src1, src2))")
triop("imin3", tint, "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "MIN2(src0, MIN2(src1, src2))")

# Three-way maximum.
triop("fmax3", tfloat, "fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "MAX2(src0, MAX2(src1, src2))")

# Median of three.
triop("fmed3", tfloat,
      "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint,
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint,
      "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
# Select with a tbool condition: src0 chooses between src1 and src2.
# Registered through opcode() directly because the condition's type differs
# from the type of the two selectable sources.
opcode("bcsel", 0, tuint, [0, 0, 0], [tbool, tuint, tuint], "",
       "src0 ? src1 : src2")
701 triop("bfi", tuint32
, """
702 unsigned mask = src0, insert = src1, base = src2;
711 dst = (base & ~mask) | (insert & mask);
715 # SM5 ubfe/ibfe assembly
716 opcode("ubfe", 0, tuint32
,
717 [0, 0, 0], [tuint32
, tint32
, tint32
], "", """
718 unsigned base = src0;
719 int offset = src1, bits = src2;
722 } else if (bits < 0 || offset < 0) {
723 dst = 0; /* undefined */
724 } else if (offset + bits < 32) {
725 dst = (base << (32 - bits - offset)) >> (32 - bits);
727 dst = base >> offset;
730 opcode("ibfe", 0, tint32
,
731 [0, 0, 0], [tint32
, tint32
, tint32
], "", """
733 int offset = src1, bits = src2;
736 } else if (bits < 0 || offset < 0) {
737 dst = 0; /* undefined */
738 } else if (offset + bits < 32) {
739 dst = (base << (32 - bits - offset)) >> (32 - bits);
741 dst = base >> offset;
745 # GLSL bitfieldExtract()
746 opcode("ubitfield_extract", 0, tuint32
,
747 [0, 0, 0], [tuint32
, tint32
, tint32
], "", """
748 unsigned base = src0;
749 int offset = src1, bits = src2;
752 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
753 dst = 0; /* undefined per the spec */
755 dst = (base >> offset) & ((1ull << bits) - 1);
758 opcode("ibitfield_extract", 0, tint32
,
759 [0, 0, 0], [tint32
, tint32
, tint32
], "", """
761 int offset = src1, bits = src2;
764 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
767 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
771 # Combines the first component of each input to make a 3-component vector.
773 triop_horiz("vec3", 3, 1, 1, 1, """
779 def quadop_horiz(name
, output_size
, src1_size
, src2_size
, src3_size
,
780 src4_size
, const_expr
):
781 opcode(name
, output_size
, tuint
,
782 [src1_size
, src2_size
, src3_size
, src4_size
],
783 [tuint
, tuint
, tuint
, tuint
],
786 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
787 [tuint32
, tuint32
, tint32
, tint32
], "", """
788 unsigned base = src0, insert = src1;
789 int offset = src2, bits = src3;
792 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
795 unsigned mask = ((1ull << bits) - 1) << offset;
796 dst = (base & ~mask) | ((insert << offset) & mask);
800 quadop_horiz("vec4", 4, 1, 1, 1, 1, """