2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
27 # Class that represents all the information we have about the opcode
28 # NOTE: this must be kept in sync with nir_op_info
31 """Class that represents all the information we have about the opcode
32 NOTE: this must be kept in sync with nir_op_info
34 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
35 input_types
, algebraic_properties
, const_expr
):
38 - name is the name of the opcode (prepend nir_op_ for the enum name)
39 - all types are strings that get nir_type_ prepended to them
40 - input_types is a list of types
41 - algebraic_properties is a space-separated string, where nir_op_is_ is
42 prepended before each entry
43 - const_expr is an expression or series of statements that computes the
44 constant value of the opcode given the constant values of its inputs.
46 Constant expressions are formed from the variables src0, src1, ...,
47 src(N-1), where N is the number of arguments. The output of the
48 expression should be stored in the dst variable. Per-component input
49 and output variables will be scalars and non-per-component input and
50 output variables will be a struct with fields named x, y, z, and w
51 all of the correct type. Input and output variables can be assumed
52 to already be of the correct type and need no conversion. In
53 particular, the conversion from the C bool type to/from NIR_TRUE and
54 NIR_FALSE happens automatically.
56 For per-component instructions, the entire expression will be
57 executed once for each component. For non-per-component
58 instructions, the expression is expected to store the correct values
59 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
60 constant expression, an assignment to dst will happen automatically
61 and the result will be equivalent to "dst = <expression>" for
62 per-component instructions and "dst.x = dst.y = ... = <expression>"
63 for non-per-component instructions.
65 assert isinstance(name
, str)
66 assert isinstance(output_size
, int)
67 assert isinstance(output_type
, str)
68 assert isinstance(input_sizes
, list)
69 assert isinstance(input_sizes
[0], int)
70 assert isinstance(input_types
, list)
71 assert isinstance(input_types
[0], str)
72 assert isinstance(algebraic_properties
, str)
73 assert isinstance(const_expr
, str)
74 assert len(input_sizes
) == len(input_types
)
75 assert 0 <= output_size
<= 4
76 for size
in input_sizes
:
81 self
.num_inputs
= len(input_sizes
)
82 self
.output_size
= output_size
83 self
.output_type
= output_type
84 self
.input_sizes
= input_sizes
85 self
.input_types
= input_types
86 self
.algebraic_properties
= algebraic_properties
87 self
.const_expr
= const_expr
89 # helper variables for strings
101 commutative
= "commutative "
102 associative
= "associative "
104 # global dictionary of opcodes
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Create an Opcode and record it in the global ``opcodes`` dictionary.

    All arguments are forwarded, in order and unmodified, to the ``Opcode``
    constructor, and the resulting object is stored under the key ``name``.
    Registering the same name twice trips the assertion below.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes, input_types,
                  algebraic_properties, const_expr)
    opcodes[name] = info
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose result type may differ from its
    source type.

    The opcode is registered with output size 0, a single source of size 0
    and type ``in_type``, and an empty algebraic-properties string.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0],
           input_types=[in_type],
           algebraic_properties="",
           const_expr=const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result both have type
    ``ty``.

    This is the same-type special case of ``unop_convert``: output size 0,
    one source of size 0, no algebraic properties.
    """
    unop_convert(name, ty, ty, const_expr)
119 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
121 opcode(name
, output_size
, output_type
, [input_size
], [input_type
], "",
124 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
125 reduce_expr
, final_expr
):
127 return "(" + prereduce_expr
.format(src
=src
) + ")"
129 return final_expr
.format(src
="(" + src
+ ")")
130 def reduce_(src0
, src1
):
131 return reduce_expr
.format(src0
=src0
, src1
=src1
)
132 src0
= prereduce("src0.x")
133 src1
= prereduce("src0.y")
134 src2
= prereduce("src0.z")
135 src3
= prereduce("src0.w")
136 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
137 final(reduce_(src0
, src1
)))
138 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
139 final(reduce_(reduce_(src0
, src1
), src2
)))
140 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
141 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
144 # These two move instructions differ in what modifiers they support and what
145 # the negate modifier means. Otherwise, they are identical.
_simple_unops = (
    # (name, type, constant expression).  Adjacent string literals are joined
    # by the parser, so each multi-line expression below is a single string.
    ("fmov", tfloat, "src0"),
    ("imov", tint, "src0"),

    ("ineg", tint, "-src0"),
    ("fneg", tfloat, "-src0"),
    ("inot", tint, "~src0"),  # invert every bit of the integer
    ("fnot", tfloat,
     "bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : "
     "((src0 == 0.0f) ? 1.0f : 0.0f)"),
    ("fsign", tfloat,
     "bit_size == 64 ? "
     "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
     "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"),
    ("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)"),
    ("iabs", tint, "(src0 < 0) ? -src0 : src0"),
    ("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)"),
    ("fsat", tfloat,
     "bit_size == 64 ? "
     "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : "
     "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"),
    ("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0"),
    ("frsq", tfloat,
     "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)"),
    ("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)"),
    ("fexp2", tfloat, "exp2f(src0)"),
    ("flog2", tfloat, "log2f(src0)"),
)
# Register the entries in table order, then drop the temporaries so no new
# module-level names are left behind.
for _entry in _simple_unops:
    unop(*_entry)
del _simple_unops, _entry
169 # Generate all of the numeric conversion opcodes
170 for src_t
in [tint
, tuint
, tfloat
]:
171 if src_t
in (tint
, tuint
):
172 dst_types
= [tfloat
, src_t
]
173 elif src_t
== tfloat
:
174 dst_types
= [tint
, tuint
, tfloat
]
176 for dst_t
in dst_types
:
178 bit_sizes
= [16, 32, 64]
180 bit_sizes
= [8, 16, 32, 64]
181 for bit_size
in bit_sizes
:
182 unop_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0], bit_size
),
183 dst_t
+ str(bit_size
), src_t
, "src0")
185 # We'll hand-code the to/from bool conversion opcodes. Because bool doesn't
186 # have multiple bit-sizes, we can always infer the size from the other type.
187 unop_convert("f2b", tbool
, tfloat
, "src0 != 0.0")
188 unop_convert("i2b", tbool
, tint
, "src0 != 0")
189 unop_convert("b2f", tfloat
, tbool
, "src0 ? 1.0 : 0.0")
190 unop_convert("b2i", tint
, tbool
, "src0 ? 1 : 0")
193 # Unary floating-point rounding operations.
196 unop("ftrunc", tfloat
, "bit_size == 64 ? trunc(src0) : truncf(src0)")
197 unop("fceil", tfloat
, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
198 unop("ffloor", tfloat
, "bit_size == 64 ? floor(src0) : floorf(src0)")
199 unop("ffract", tfloat
, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
200 unop("fround_even", tfloat
, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
202 unop("fquantize2f16", tfloat
, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
204 # Trigonometric operations.
207 unop("fsin", tfloat
, "bit_size == 64 ? sin(src0) : sinf(src0)")
208 unop("fcos", tfloat
, "bit_size == 64 ? cos(src0) : cosf(src0)")
211 # Partial derivatives.
# The derivative of a constant is 0, so every derivative variant has the
# constant expression "0.0".
for _deriv in ("fddx", "fddy",
               "fddx_fine", "fddy_fine",
               "fddx_coarse", "fddy_coarse"):
    unop(_deriv, tfloat, "0.0")
del _deriv
222 # Floating point pack and unpack operations.
225 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
226 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
227 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
228 """.replace("fmt", fmt
))
231 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
232 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
233 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
234 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
235 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
236 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the ``unpack_<fmt>_2x16`` opcode.

    The opcode has one ``tuint32`` source of size 1 and a ``tfloat32`` result
    of size 2.  Component x is decoded from the low 16 bits of the source and
    component y from the high 16 bits, each through ``unpack_<fmt>_1x16``
    (the literal text "fmt" in the expression is replaced by ``fmt``).
    """
    # The high half has to be shifted *down* before the uint16_t cast.  The
    # previous "<< 16" left the low 16 bits zero, so dst.y always decoded 0.
    # ">> 16" mirrors pack_<fmt>_2x16, which stores the y component in the
    # upper half with "<< 16".
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
245 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
246 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
247 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
248 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
249 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
250 """.replace("fmt", fmt
))
264 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
265 dst.x = (src0.x & 0xffff) | (src0.y << 16);
268 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
269 dst.x = (src0.x << 0) |
275 unop_horiz("pack_64_2x32", 1, tuint64
, 2, tuint32
,
276 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
278 unop_horiz("unpack_64_2x32", 2, tuint32
, 1, tuint64
,
279 "dst.x = src0.x; dst.y = src0.x >> 32;")
281 # Lowered floating point unpacking operations.
284 unop_horiz("unpack_half_2x16_split_x", 1, tfloat32
, 1, tuint32
,
285 "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
286 unop_horiz("unpack_half_2x16_split_y", 1, tfloat32
, 1, tuint32
,
287 "unpack_half_1x16((uint16_t)(src0.x >> 16))")
289 unop_convert("unpack_64_2x32_split_x", tuint32
, tuint64
, "src0")
290 unop_convert("unpack_64_2x32_split_y", tuint32
, tuint64
, "src0 >> 32")
292 # Bit operations, part of ARB_gpu_shader5.
295 unop("bitfield_reverse", tuint32
, """
296 /* we're not winning any awards for speed here, but that's ok */
298 for (unsigned bit = 0; bit < 32; bit++)
299 dst |= ((src0 >> bit) & 1) << (31 - bit);
301 unop("bit_count", tuint32
, """
303 for (unsigned bit = 0; bit < 32; bit++) {
304 if ((src0 >> bit) & 1)
309 unop_convert("ufind_msb", tint32
, tuint32
, """
311 for (int bit = 31; bit > 0; bit--) {
312 if ((src0 >> bit) & 1) {
319 unop("ifind_msb", tint32
, """
321 for (int bit = 31; bit >= 0; bit--) {
322 /* If src0 < 0, we're looking for the first 0 bit.
323 * if src0 >= 0, we're looking for the first 1 bit.
325 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
326 (!((src0 >> bit) & 1) && (src0 < 0))) {
333 unop("find_lsb", tint32
, """
335 for (unsigned bit = 0; bit < 32; bit++) {
336 if ((src0 >> bit) & 1) {
# Register fnoise<i>_<j> for every result size i and source size j in 1..4.
# Each one has the constant expression "0.0f".
# range() is used instead of xrange() so the script runs unchanged under
# both Python 2 and Python 3; the iteration is only four elements long, so
# building a list under Python 2 costs nothing.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose result type may differ from its
    source type.

    Both sources have type ``in_type`` and size 0; the output size is 0 as
    well.  ``alg_props`` is passed through as the algebraic-properties
    string.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0] * 2,
           input_types=[in_type] * 2,
           algebraic_properties=alg_props,
           const_expr=const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type
    ``ty``.

    Thin wrapper around ``binop_convert`` with identical input and output
    types.
    """
    binop_convert(name, out_type=ty, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison opcode.

    Both sources have type ``ty``; the result type is always ``tbool``.
    """
    binop_convert(name, out_type=tbool, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
357 def binop_horiz(name
, out_size
, out_type
, src1_size
, src1_type
, src2_size
,
358 src2_type
, const_expr
):
359 opcode(name
, out_size
, out_type
, [src1_size
, src2_size
], [src1_type
, src2_type
],
362 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
363 reduce_expr
, final_expr
):
365 return final_expr
.format(src
= "(" + src
+ ")")
366 def reduce_(src0
, src1
):
367 return reduce_expr
.format(src0
=src0
, src1
=src1
)
368 def prereduce(src0
, src1
):
369 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
370 src0
= prereduce("src0.x", "src1.x")
371 src1
= prereduce("src0.y", "src1.y")
372 src2
= prereduce("src0.z", "src1.z")
373 src3
= prereduce("src0.w", "src1.w")
374 opcode(name
+ "2", output_size
, output_type
,
375 [2, 2], [src_type
, src_type
], commutative
,
376 final(reduce_(src0
, src1
)))
377 opcode(name
+ "3", output_size
, output_type
,
378 [3, 3], [src_type
, src_type
], commutative
,
379 final(reduce_(reduce_(src0
, src1
), src2
)))
380 opcode(name
+ "4", output_size
, output_type
,
381 [4, 4], [src_type
, src_type
], commutative
,
382 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
384 binop("fadd", tfloat
, commutative
+ associative
, "src0 + src1")
385 binop("iadd", tint
, commutative
+ associative
, "src0 + src1")
386 binop("fsub", tfloat
, "", "src0 - src1")
387 binop("isub", tint
, "", "src0 - src1")
389 binop("fmul", tfloat
, commutative
+ associative
, "src0 * src1")
390 # low 32-bits of signed/unsigned integer multiply
391 binop("imul", tint
, commutative
+ associative
, "src0 * src1")
392 # high 32-bits of signed integer multiply
393 binop("imul_high", tint32
, commutative
,
394 "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
395 # high 32-bits of unsigned integer multiply
396 binop("umul_high", tuint32
, commutative
,
397 "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")
binop("fdiv", tfloat, "", "src0 / src1")
# Integer division by zero is undefined behaviour in C, so evaluating
# "src0 / src1" directly in the constant folder can trap when the divisor is
# a constant 0.  Fold such divisions to 0 instead, the same way umod, irem
# and imod guard their "%" with "src1 == 0 ? 0 : ...".
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
403 # returns a boolean representing the carry resulting from the addition of
404 # the two unsigned arguments.
406 binop_convert("uadd_carry", tuint
, tuint
, commutative
, "src0 + src1 < src0")
408 # returns a boolean representing the borrow resulting from the subtraction
409 # of the two unsigned arguments.
411 binop_convert("usub_borrow", tuint
, tuint
, "", "src0 < src1")
413 binop("umod", tuint
, "", "src1 == 0 ? 0 : src0 % src1")
415 # For signed integers, there are several different possible definitions of
416 # "modulus" or "remainder". We follow the conventions used by LLVM and
417 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
418 # operation while the imod opcode implements the more mathematical
419 # "modulus" operation. For details on the difference, see
421 # http://mathforum.org/library/drmath/view/52343.html
423 binop("irem", tint
, "", "src1 == 0 ? 0 : src0 % src1")
424 binop("imod", tint
, "",
425 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
426 " src0 % src1 : src0 % src1 + src1)")
427 binop("fmod", tfloat
, "", "src0 - src1 * floorf(src0 / src1)")
428 binop("frem", tfloat
, "", "src0 - src1 * truncf(src0 / src1)")
435 # these integer-aware comparisons return a boolean (0 or ~0)
437 binop_compare("flt", tfloat
, "", "src0 < src1")
438 binop_compare("fge", tfloat
, "", "src0 >= src1")
439 binop_compare("feq", tfloat
, commutative
, "src0 == src1")
440 binop_compare("fne", tfloat
, commutative
, "src0 != src1")
441 binop_compare("ilt", tint
, "", "src0 < src1")
442 binop_compare("ige", tint
, "", "src0 >= src1")
443 binop_compare("ieq", tint
, commutative
, "src0 == src1")
444 binop_compare("ine", tint
, commutative
, "src0 != src1")
445 binop_compare("ult", tuint
, "", "src0 < src1")
446 binop_compare("uge", tuint
, "", "src0 >= src1")
448 # integer-aware GLSL-style comparisons that compare floats and ints
450 binop_reduce("ball_fequal", 1, tbool
, tfloat
, "{src0} == {src1}",
451 "{src0} && {src1}", "{src}")
452 binop_reduce("bany_fnequal", 1, tbool
, tfloat
, "{src0} != {src1}",
453 "{src0} || {src1}", "{src}")
454 binop_reduce("ball_iequal", 1, tbool
, tint
, "{src0} == {src1}",
455 "{src0} && {src1}", "{src}")
456 binop_reduce("bany_inequal", 1, tbool
, tint
, "{src0} != {src1}",
457 "{src0} || {src1}", "{src}")
459 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
461 binop_reduce("fall_equal", 1, tfloat32
, tfloat32
, "{src0} == {src1}",
462 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
463 binop_reduce("fany_nequal", 1, tfloat32
, tfloat32
, "{src0} != {src1}",
464 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
466 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
467 # and false respectively
469 binop("slt", tfloat32
, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
470 binop("sge", tfloat
, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
471 binop("seq", tfloat32
, commutative
, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
472 binop("sne", tfloat32
, commutative
, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
475 opcode("ishl", 0, tint
, [0, 0], [tint
, tuint32
], "", "src0 << src1")
476 opcode("ishr", 0, tint
, [0, 0], [tint
, tuint32
], "", "src0 >> src1")
477 opcode("ushr", 0, tuint
, [0, 0], [tuint
, tuint32
], "", "src0 >> src1")
479 # bitwise logic operators
481 # These are also used as boolean and, or, xor for hardware supporting
485 binop("iand", tuint
, commutative
+ associative
, "src0 & src1")
486 binop("ior", tuint
, commutative
+ associative
, "src0 | src1")
487 binop("ixor", tuint
, commutative
+ associative
, "src0 ^ src1")
490 # floating point logic operators
492 # These use (src != 0.0) for testing the truth of the input, and output 1.0
493 # for true and 0.0 for false
495 binop("fand", tfloat32
, commutative
,
496 "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
497 binop("for", tfloat32
, commutative
,
498 "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
499 binop("fxor", tfloat32
, commutative
,
500 "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
502 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
505 binop_reduce("fdot_replicated", 4, tfloat
, tfloat
,
506 "{src0} * {src1}", "{src0} + {src1}", "{src}")
508 opcode("fdph", 1, tfloat
, [3, 4], [tfloat
, tfloat
], "",
509 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
510 opcode("fdph_replicated", 4, tfloat
, [3, 4], [tfloat
, tfloat
], "",
511 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
513 binop("fmin", tfloat
, "", "fminf(src0, src1)")
514 binop("imin", tint
, commutative
+ associative
, "src1 > src0 ? src0 : src1")
515 binop("umin", tuint
, commutative
+ associative
, "src1 > src0 ? src0 : src1")
516 binop("fmax", tfloat
, "", "fmaxf(src0, src1)")
517 binop("imax", tint
, commutative
+ associative
, "src1 > src0 ? src1 : src0")
518 binop("umax", tuint
, commutative
+ associative
, "src1 > src0 ? src1 : src0")
520 # Saturated vector add for 4 8bit ints.
521 binop("usadd_4x8", tint32
, commutative
+ associative
, """
523 for (int i = 0; i < 32; i += 8) {
524 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
528 # Saturated vector subtract for 4 8bit ints.
529 binop("ussub_4x8", tint32
, "", """
531 for (int i = 0; i < 32; i += 8) {
532 int src0_chan = (src0 >> i) & 0xff;
533 int src1_chan = (src1 >> i) & 0xff;
534 if (src0_chan > src1_chan)
535 dst |= (src0_chan - src1_chan) << i;
539 # vector min for 4 8bit ints.
540 binop("umin_4x8", tint32
, commutative
+ associative
, """
542 for (int i = 0; i < 32; i += 8) {
543 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
547 # vector max for 4 8bit ints.
548 binop("umax_4x8", tint32
, commutative
+ associative
, """
550 for (int i = 0; i < 32; i += 8) {
551 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
555 # unorm multiply: (a * b) / 255.
556 binop("umul_unorm_4x8", tint32
, commutative
+ associative
, """
558 for (int i = 0; i < 32; i += 8) {
559 int src0_chan = (src0 >> i) & 0xff;
560 int src1_chan = (src1 >> i) & 0xff;
561 dst |= ((src0_chan * src1_chan) / 255) << i;
# pow() is the double-precision routine and powf() the single-precision one,
# so the 64-bit case must call pow().  The two arms were previously swapped,
# which rounded 64-bit operands through float.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
567 binop_horiz("pack_half_2x16_split", 1, tuint32
, 1, tfloat32
, 1, tfloat32
,
568 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
570 binop_convert("pack_64_2x32_split", tuint64
, tuint32
, "",
571 "src0 | ((uint64_t)src1 << 32)")
573 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
574 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
575 # if either of its arguments are 32.
576 binop_convert("bfm", tuint32
, tint32
, "", """
577 int bits = src0, offset = src1;
578 if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
579 dst = 0; /* undefined */
581 dst = ((1u << bits) - 1) << offset;
584 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], "", """
585 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
586 /* flush denormals to zero. */
588 dst = copysignf(0.0f, src0);
591 # Combines the first component of each input to make a 2-component vector.
593 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
599 binop("extract_u8", tuint
, "", "(uint8_t)(src0 >> (src1 * 8))")
600 binop("extract_i8", tint
, "", "(int8_t)(src0 >> (src1 * 8))")
603 binop("extract_u16", tuint
, "", "(uint16_t)(src0 >> (src1 * 16))")
604 binop("extract_i16", tint
, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, const_expr):
    """Register a three-source opcode whose sources and result all have type
    ``ty``.

    Output size and all three source sizes are 0, and the
    algebraic-properties string is empty.
    """
    source_count = 3
    opcode(name, 0, ty, [0] * source_count, [ty] * source_count, "",
           const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source opcode with explicit output and source sizes.

    The result and all three sources have type ``tuint``; no algebraic
    properties are set.
    """
    src_sizes = [src1_size, src2_size, src3_size]
    src_types = [tuint] * len(src_sizes)
    opcode(name, output_size, tuint, src_sizes, src_types, "", const_expr)
614 triop("ffma", tfloat
, "src0 * src1 + src2")
616 triop("flrp", tfloat
, "src0 * (1 - src2) + src1 * src2")
620 # A vector conditional select instruction (like ?:, but operating per-
621 # component on vectors). There are two versions, one for floating point
622 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
625 triop("fcsel", tfloat32
, "(src0 != 0.0f) ? src1 : src2")
626 opcode("bcsel", 0, tuint
, [0, 0, 0],
627 [tbool
, tuint
, tuint
], "", "src0 ? src1 : src2")
630 triop("bfi", tuint32
, """
631 unsigned mask = src0, insert = src1, base = src2;
640 dst = (base & ~mask) | (insert & mask);
644 # SM5 ubfe/ibfe assembly
645 opcode("ubfe", 0, tuint32
,
646 [0, 0, 0], [tuint32
, tint32
, tint32
], "", """
647 unsigned base = src0;
648 int offset = src1, bits = src2;
651 } else if (bits < 0 || offset < 0) {
652 dst = 0; /* undefined */
653 } else if (offset + bits < 32) {
654 dst = (base << (32 - bits - offset)) >> (32 - bits);
656 dst = base >> offset;
659 opcode("ibfe", 0, tint32
,
660 [0, 0, 0], [tint32
, tint32
, tint32
], "", """
662 int offset = src1, bits = src2;
665 } else if (bits < 0 || offset < 0) {
666 dst = 0; /* undefined */
667 } else if (offset + bits < 32) {
668 dst = (base << (32 - bits - offset)) >> (32 - bits);
670 dst = base >> offset;
674 # GLSL bitfieldExtract()
675 opcode("ubitfield_extract", 0, tuint32
,
676 [0, 0, 0], [tuint32
, tint32
, tint32
], "", """
677 unsigned base = src0;
678 int offset = src1, bits = src2;
681 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
682 dst = 0; /* undefined per the spec */
684 dst = (base >> offset) & ((1ull << bits) - 1);
687 opcode("ibitfield_extract", 0, tint32
,
688 [0, 0, 0], [tint32
, tint32
, tint32
], "", """
690 int offset = src1, bits = src2;
693 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
696 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
700 # Combines the first component of each input to make a 3-component vector.
702 triop_horiz("vec3", 3, 1, 1, 1, """
708 def quadop_horiz(name
, output_size
, src1_size
, src2_size
, src3_size
,
709 src4_size
, const_expr
):
710 opcode(name
, output_size
, tuint
,
711 [src1_size
, src2_size
, src3_size
, src4_size
],
712 [tuint
, tuint
, tuint
, tuint
],
715 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
716 [tuint32
, tuint32
, tint32
, tint32
], "", """
717 unsigned base = src0, insert = src1;
718 int offset = src2, bits = src3;
721 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
724 unsigned mask = ((1ull << bits) - 1) << offset;
725 dst = (base & ~mask) | ((insert << bits) & mask);
729 quadop_horiz("vec4", 4, 1, 1, 1, 1, """