2 # Copyright (C) 2014 Connor Abbott
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 # Connor Abbott (cwabbott0@gmail.com)
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
32 """Class that represents all the information we have about the opcode
33 NOTE: this must be kept in sync with nir_op_info
35 def __init__(self
, name
, output_size
, output_type
, input_sizes
,
36 input_types
, algebraic_properties
, const_expr
):
39 - name is the name of the opcode (prepend nir_op_ for the enum name)
40 - all types are strings that get nir_type_ prepended to them
41 - input_types is a list of types
   - algebraic_properties is a space-separated string, where nir_op_is_ is
43 prepended before each entry
44 - const_expr is an expression or series of statements that computes the
45 constant value of the opcode given the constant values of its inputs.
47 Constant expressions are formed from the variables src0, src1, ...,
48 src(N-1), where N is the number of arguments. The output of the
49 expression should be stored in the dst variable. Per-component input
50 and output variables will be scalars and non-per-component input and
51 output variables will be a struct with fields named x, y, z, and w
52 all of the correct type. Input and output variables can be assumed
53 to already be of the correct type and need no conversion. In
54 particular, the conversion from the C bool type to/from NIR_TRUE and
55 NIR_FALSE happens automatically.
57 For per-component instructions, the entire expression will be
58 executed once for each component. For non-per-component
59 instructions, the expression is expected to store the correct values
60 in dst.x, dst.y, etc. If "dst" does not exist anywhere in the
61 constant expression, an assignment to dst will happen automatically
62 and the result will be equivalent to "dst = <expression>" for
63 per-component instructions and "dst.x = dst.y = ... = <expression>"
64 for non-per-component instructions.
66 assert isinstance(name
, str)
67 assert isinstance(output_size
, int)
68 assert isinstance(output_type
, str)
69 assert isinstance(input_sizes
, list)
70 assert isinstance(input_sizes
[0], int)
71 assert isinstance(input_types
, list)
72 assert isinstance(input_types
[0], str)
73 assert isinstance(algebraic_properties
, str)
74 assert isinstance(const_expr
, str)
75 assert len(input_sizes
) == len(input_types
)
76 assert 0 <= output_size
<= 4
77 for size
in input_sizes
:
82 self
.num_inputs
= len(input_sizes
)
83 self
.output_size
= output_size
84 self
.output_type
= output_type
85 self
.input_sizes
= input_sizes
86 self
.input_types
= input_types
87 self
.algebraic_properties
= algebraic_properties
88 self
.const_expr
= const_expr
90 # helper variables for strings
105 _TYPE_SPLIT_RE
= re
.compile(r
'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
def type_has_size(type_):
    """Tell whether a NIR type string carries an explicit bit size.

    Returns True when the base type in ``type_`` is followed by digits
    (the "bits" group of ``_TYPE_SPLIT_RE``), False otherwise.  Asserts
    that ``type_`` starts with a recognised base type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
def type_size(type_):
    """Return the bit size encoded in a NIR type string as an int.

    Asserts that ``type_`` starts with a recognised base type and that a
    bit-size suffix is present.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
119 def type_sizes(type_
):
120 if type_has_size(type_
):
121 return [type_size(type_
)]
122 elif type_
== 'bool':
124 elif type_
== 'float':
127 return [1, 8, 16, 32, 64]
def type_base_type(type_):
    """Return the base type of a NIR type string with any bit size removed.

    The result is the "type" group of ``_TYPE_SPLIT_RE``, i.e. one of
    "int", "uint", "float" or "bool".  Asserts that ``type_`` starts with
    one of those.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
# Algebraic property tokens passed as the algebraic_properties / alg_props
# argument of the opcode helpers.  Each one ends with a space so that several
# can be joined with "+" (e.g. commutative + associative) into a single
# space-separated string.
commutative = "commutative "
associative = "associative "
137 # global dictionary of opcodes
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode and record it in the global ``opcodes`` dictionary.

    Every argument is handed to the ``Opcode`` constructor unchanged; the
    new object is stored under ``name``.  Asserts that ``name`` has not
    been registered before.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes,
                  input_types, algebraic_properties, const_expr)
    opcodes[name] = info
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output and input types may differ.

    Output size and the single input size are both 0, and no algebraic
    properties are attached.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0],
           input_types=[in_type],
           algebraic_properties="",
           const_expr=const_expr)
def unop(name, ty, const_expr):
    """Register a one-source opcode whose output and input share type ``ty``.

    Output size and the single input size are both 0, and no algebraic
    properties are attached.
    """
    opcode(name,
           output_size=0,
           output_type=ty,
           input_sizes=[0],
           input_types=[ty],
           algebraic_properties="",
           const_expr=const_expr)
152 def unop_horiz(name
, output_size
, output_type
, input_size
, input_type
,
154 opcode(name
, output_size
, output_type
, [input_size
], [input_type
], "",
157 def unop_reduce(name
, output_size
, output_type
, input_type
, prereduce_expr
,
158 reduce_expr
, final_expr
):
160 return "(" + prereduce_expr
.format(src
=src
) + ")"
162 return final_expr
.format(src
="(" + src
+ ")")
163 def reduce_(src0
, src1
):
164 return reduce_expr
.format(src0
=src0
, src1
=src1
)
165 src0
= prereduce("src0.x")
166 src1
= prereduce("src0.y")
167 src2
= prereduce("src0.z")
168 src3
= prereduce("src0.w")
169 unop_horiz(name
+ "2", output_size
, output_type
, 2, input_type
,
170 final(reduce_(src0
, src1
)))
171 unop_horiz(name
+ "3", output_size
, output_type
, 3, input_type
,
172 final(reduce_(reduce_(src0
, src1
), src2
)))
173 unop_horiz(name
+ "4", output_size
, output_type
, 4, input_type
,
174 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
177 # These two move instructions differ in what modifiers they support and what
178 # the negate modifier means. Otherwise, they are identical.
# Both moves fold to a plain copy of the source.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Arithmetic negation, bitwise not and float "logical not".
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 when the source equals zero and 0.0 otherwise; the two arms
# differ only in using double or float literals depending on bit_size.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# Sign functions return 0 for zero, 1 for positive and -1 for negative input.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat clamps to [0, 1]: values above 1 become 1, values <= 0 become 0.
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root pick the double or the
# float C routine according to bit_size.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): exp2f/log2f are the float variants and are used for every
# bit size, unlike fsqrt above - confirm this is intended for 64-bit.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
202 # Generate all of the numeric conversion opcodes
203 for src_t
in [tint
, tuint
, tfloat
, tbool
]:
205 dst_types
= [tfloat
, tint
]
207 dst_types
= [tfloat
, tint
, tbool
]
209 dst_types
= [tfloat
, tuint
]
210 elif src_t
== tfloat
:
211 dst_types
= [tint
, tuint
, tfloat
, tbool
]
213 for dst_t
in dst_types
:
214 for bit_size
in type_sizes(dst_t
):
215 if bit_size
== 16 and dst_t
== tfloat
and src_t
== tfloat
:
216 rnd_modes
= ['_rtne', '_rtz', '']
217 for rnd_mode
in rnd_modes
:
218 unop_convert("{0}2{1}{2}{3}".format(src_t
[0], dst_t
[0],
220 dst_t
+ str(bit_size
), src_t
, "src0")
222 conv_expr
= "src0 != 0" if dst_t
== tbool
else "src0"
223 unop_convert("{0}2{1}{2}".format(src_t
[0], dst_t
[0], bit_size
),
224 dst_t
+ str(bit_size
), src_t
, conv_expr
)
227 # Unary floating-point rounding operations.
# Each rounding op selects the double C routine for 64-bit values and the
# float ("f"-suffixed) routine otherwise.
unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# Fractional part: the source minus its floor.
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Magnitudes below 2^-14 become a zero carrying the source's sign; everything
# else is round-tripped through _mesa_float_to_half / _mesa_half_to_float.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
238 # Trigonometric operations.
unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# Both frexp ops take a 64-bit float source.  frexp_exp keeps only the
# exponent that C frexp() writes through its pointer argument (here &dst);
# frexp_sig keeps frexp()'s return value and discards the exponent in "n".
unop_convert("frexp_exp", tint32, tfloat64, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat64, tfloat64, "int n; dst = frexp(src0, &n);")
248 # Partial derivatives.
# Every derivative variant (plain, fine, coarse; x and y) constant-folds to
# 0.0 for the reason stated on fddx.
unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
259 # Floating point pack and unpack operations.
262 unop_horiz("pack_" + fmt
+ "_2x16", 1, tuint32
, 2, tfloat32
, """
263 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
264 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
265 """.replace("fmt", fmt
))
268 unop_horiz("pack_" + fmt
+ "_4x8", 1, tuint32
, 4, tfloat32
, """
269 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
270 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
271 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
272 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
273 """.replace("fmt", fmt
))
def unpack_2x16(fmt):
    """Register the "unpack_<fmt>_2x16" opcode.

    The opcode takes one 32-bit unsigned component and produces two 32-bit
    float components: dst.x is decoded from the low 16 bits of src0.x and
    dst.y from the high 16 bits.  Every occurrence of the text "fmt" in the
    C template is replaced by ``fmt``, so the helper that ends up being
    called is ``unpack_<fmt>_1x16``.
    """
    # The high half must be shifted *down* by 16 before the uint16_t cast:
    # shifting left by 16 leaves the low 16 bits zero, so the cast would
    # always yield 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
282 unop_horiz("unpack_" + fmt
+ "_4x8", 4, tfloat32
, 1, tuint32
, """
283 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
284 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
285 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
286 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
287 """.replace("fmt", fmt
))
301 unop_horiz("pack_uvec2_to_uint", 1, tuint32
, 2, tuint32
, """
302 dst.x = (src0.x & 0xffff) | (src0.y << 16);
305 unop_horiz("pack_uvec4_to_uint", 1, tuint32
, 4, tuint32
, """
306 dst.x = (src0.x << 0) |
# Integer packing: component x occupies the lowest bits and each following
# component is shifted up by the component width.  The casts widen a
# component to the destination width before it is shifted.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Inverse of pack_64_2x32: x receives the low 32 bits, y the high 32 bits.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
# Split one 64-bit value into four 16-bit components, lowest bits first.
# The source has a single component, so every output - including dst.w - has
# to be taken from src0.x; src0.w is not part of the input.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

# Split one 32-bit value into its low (x) and high (y) 16-bit halves.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")
330 # Lowered floating point unpacking operations.
# "_split_x" variants read the low half of the source and "_split_y" variants
# the high half.  The half-float versions decode the selected 16 bits with
# unpack_half_1x16.
unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

# The integer versions fold to "src0" or a right shift; the narrower
# destination type is declared through out_type.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
344 # Bit operations, part of ARB_gpu_shader5.
347 unop("bitfield_reverse", tuint32
, """
348 /* we're not winning any awards for speed here, but that's ok */
350 for (unsigned bit = 0; bit < 32; bit++)
351 dst |= ((src0 >> bit) & 1) << (31 - bit);
353 unop_convert("bit_count", tuint32
, tuint
, """
355 for (unsigned bit = 0; bit < bit_size; bit++) {
356 if ((src0 >> bit) & 1)
361 unop_convert("ufind_msb", tint32
, tuint
, """
363 for (int bit = bit_size - 1; bit >= 0; bit--) {
364 if ((src0 >> bit) & 1) {
371 unop("ifind_msb", tint32
, """
373 for (int bit = 31; bit >= 0; bit--) {
374 /* If src0 < 0, we're looking for the first 0 bit.
375 * if src0 >= 0, we're looking for the first 1 bit.
377 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
378 (!((src0 >> bit) & 1) && (src0 < 0))) {
385 unop_convert("find_lsb", tint32
, tint
, """
387 for (unsigned bit = 0; bit < bit_size; bit++) {
388 if ((src0 >> bit) & 1) {
# Register fnoise<i>_<j> for every output size i and input size j in 1..4.
# All sixteen variants constant-fold to 0.0f.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
401 # AMD_gcn_shader extended instructions
402 unop_horiz("cube_face_coord", 2, tfloat32
, 3, tfloat32
, """
404 float absX = fabs(src0.x);
405 float absY = fabs(src0.y);
406 float absZ = fabs(src0.z);
407 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
408 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
409 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
410 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
411 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
412 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
415 unop_horiz("cube_face_index", 1, tfloat32
, 3, tfloat32
, """
416 float absX = fabs(src0.x);
417 float absY = fabs(src0.y);
418 float absZ = fabs(src0.z);
419 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
420 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
421 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
422 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
423 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
424 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share ``in_type``.

    The output type is ``out_type``; the output size and both input sizes
    are 0.  ``alg_props`` is passed on as the algebraic properties string.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0, 0],
           input_types=[in_type, in_type],
           algebraic_properties=alg_props,
           const_expr=const_expr)
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all use ``ty``."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ``ty`` with a ``tbool1`` result."""
    binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ``ty`` with a ``tbool32`` result."""
    binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
440 def binop_horiz(name
, out_size
, out_type
, src1_size
, src1_type
, src2_size
,
441 src2_type
, const_expr
):
442 opcode(name
, out_size
, out_type
, [src1_size
, src2_size
], [src1_type
, src2_type
],
445 def binop_reduce(name
, output_size
, output_type
, src_type
, prereduce_expr
,
446 reduce_expr
, final_expr
):
448 return final_expr
.format(src
= "(" + src
+ ")")
449 def reduce_(src0
, src1
):
450 return reduce_expr
.format(src0
=src0
, src1
=src1
)
451 def prereduce(src0
, src1
):
452 return "(" + prereduce_expr
.format(src0
=src0
, src1
=src1
) + ")"
453 src0
= prereduce("src0.x", "src1.x")
454 src1
= prereduce("src0.y", "src1.y")
455 src2
= prereduce("src0.z", "src1.z")
456 src3
= prereduce("src0.w", "src1.w")
457 opcode(name
+ "2", output_size
, output_type
,
458 [2, 2], [src_type
, src_type
], commutative
,
459 final(reduce_(src0
, src1
)))
460 opcode(name
+ "3", output_size
, output_type
,
461 [3, 3], [src_type
, src_type
], commutative
,
462 final(reduce_(reduce_(src0
, src1
), src2
)))
463 opcode(name
+ "4", output_size
, output_type
,
464 [4, 4], [src_type
, src_type
], commutative
,
465 final(reduce_(reduce_(src0
, src1
), reduce_(src2
, src3
))))
# Plain addition, flagged as both commutative and associative.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
469 binop("iadd_sat", tint
, commutative
+ associative
, """
471 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
472 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
# Unsigned saturating add: a sum smaller than src0 signals wrap-around, in
# which case UINT64_MAX is produced instead of the sum.
# NOTE(review): presumably UINT64_MAX is truncated to the destination width
# when stored - confirm against the constant-folding template.
binop("uadd_sat", tuint, commutative,
      "(src0 + src1) < src0 ? UINT64_MAX : (src0 + src1)")
476 binop("isub_sat", tint
, "", """
478 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
479 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
# Unsigned saturating subtract: clamps to 0 when src1 exceeds src0.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Subtraction carries no algebraic properties.
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
490 # Generate 64 bit result from 2 32 bits quantity
# Both sources are cast to the 64-bit type before multiplying, so the product
# is computed at the width of the 64-bit destination.
binop_convert("imul_2x32_64", tint64, tint32, commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, commutative,
              "(uint64_t)src0 * (uint64_t)src1")
496 # high 32-bits of signed integer multiply
497 binop("imul_high", tint
, commutative
, """
498 if (bit_size == 64) {
499 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
500 * extension to work properly. The casts are kind-of annoying but needed
501 * to prevent compiler warnings.
503 uint32_t src0_u32[4] = {
509 uint32_t src1_u32[4] = {
515 uint32_t prod_u32[4];
516 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
517 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
519 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
523 # high 32-bits of unsigned integer multiply
524 binop("umul_high", tuint
, commutative
, """
525 if (bit_size == 64) {
526 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
527 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
528 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
529 uint32_t prod_u32[4];
530 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
531 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
533 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
binop("fdiv", tfloat, "", "src0 / src1")
# Integer division by zero folds to 0 rather than being evaluated.
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
# NOTE(review): the declared output type is tuint, not a bool type; the
# expression is a C comparison - confirm how the result is meant to be stored.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
#
# The halved XOR term is therefore subtracted from (x | y), not added.
# Example: x = 3, y = 0 gives (3 + 0 + 1) >> 1 = 2 = (3 | 0) - (3 >> 1).
binop("irhadd", tint, commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
# As with integer division, a zero divisor folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder whenever that remainder is non-zero and
# the operands differ in sign.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient with floorf, frem with truncf.
# NOTE(review): the float variants floorf/truncf are used for every bit size -
# confirm this is intended for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
599 # these integer-aware comparisons return a boolean (0 or ~0)
# Comparisons registered through binop_compare (tbool1 result).  Only the
# equality and inequality tests are flagged commutative.
binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
# The same comparisons registered through binop_compare32 (tbool32 result).
binop_compare32("flt32", tfloat, "", "src0 < src1")
binop_compare32("fge32", tfloat, "", "src0 >= src1")
binop_compare32("feq32", tfloat, commutative, "src0 == src1")
binop_compare32("fne32", tfloat, commutative, "src0 != src1")
binop_compare32("ilt32", tint, "", "src0 < src1")
binop_compare32("ige32", tint, "", "src0 >= src1")
binop_compare32("ieq32", tint, commutative, "src0 == src1")
binop_compare32("ine32", tint, commutative, "src0 != src1")
binop_compare32("ult32", tuint, "", "src0 < src1")
binop_compare32("uge32", tuint, "", "src0 >= src1")
622 # integer-aware GLSL-style comparisons that compare floats and ints
# Arguments after the source type are the pre-reduce, reduce and final
# expression templates.  "all" ops compare components with == and combine the
# results with &&; "any" ops compare with != and combine with ||.
binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# Same reductions with a tbool32 result.
binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
# (the final expression maps the reduced truth value to 1.0f / 0.0f)

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
649 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
650 # and false respectively
# NOTE(review): sge is declared with tfloat while slt, seq and sne use
# tfloat32 - confirm whether the difference is intentional.
binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
657 # SPIRV shifts are undefined for shift-operands >= bitsize,
658 # but SM5 shifts are defined to use the least significant bits, only
659 # The NIR definition is according to the SM5 specification.
# The shift count (second source) is always tuint32 and is masked with
# (bit width of src0) - 1, so only its least significant bits are used.
# ishr shifts a signed value, ushr an unsigned one.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
664 # bitwise logic operators
666 # These are also used as boolean and, or, xor for hardware supporting
# Bitwise and / or / xor on unsigned integers; all three are flagged
# commutative and associative.
binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")
675 # floating point logic operators
677 # These use (src != 0.0) for testing the truth of the input, and output 1.0
678 # for true and 0.0 for false
# A source counts as true when it differs from 0.0f; results are 1.0f / 0.0f.
binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
# fxor is true when exactly one of the two sources is non-zero.
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
687 binop_reduce("fdot", 1, tfloat
, tfloat
, "{src0} * {src1}", "{src0} + {src1}",
# Dot product declared with an output size of 4 instead of 1: components are
# multiplied pairwise and the products summed.
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph takes a 3-component and a 4-component source: a three-component dot
# product plus src1.w.  The replicated form differs only in its output size.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
# The integer min/max ops are flagged commutative and associative; the float
# versions carry no algebraic properties.
# NOTE(review): fminf/fmaxf are the float variants and are used for every bit
# size - confirm this is intended for 64-bit sources.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
705 # Saturated vector add for 4 8bit ints.
706 binop("usadd_4x8", tint32
, commutative
+ associative
, """
708 for (int i = 0; i < 32; i += 8) {
709 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
713 # Saturated vector subtract for 4 8bit ints.
714 binop("ussub_4x8", tint32
, "", """
716 for (int i = 0; i < 32; i += 8) {
717 int src0_chan = (src0 >> i) & 0xff;
718 int src1_chan = (src1 >> i) & 0xff;
719 if (src0_chan > src1_chan)
720 dst |= (src0_chan - src1_chan) << i;
724 # vector min for 4 8bit ints.
725 binop("umin_4x8", tint32
, commutative
+ associative
, """
727 for (int i = 0; i < 32; i += 8) {
728 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
732 # vector max for 4 8bit ints.
733 binop("umax_4x8", tint32
, commutative
+ associative
, """
735 for (int i = 0; i < 32; i += 8) {
736 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
740 # unorm multiply: (a * b) / 255.
741 binop("umul_unorm_4x8", tint32
, commutative
+ associative
, """
743 for (int i = 0; i < 32; i += 8) {
744 int src0_chan = (src0 >> i) & 0xff;
745 int src1_chan = (src1 >> i) & 0xff;
746 dst |= ((src0_chan * src1_chan) / 255) << i;
# Use the double-precision pow() for 64-bit values and the float powf()
# otherwise, matching how fsqrt, fsin, ftrunc etc. choose their C routine.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
# "Split" packs take the low part as the first source and the high part as
# the second source.  The half-float version converts each source with
# pack_half_1x16 before combining.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# The second source is widened to the destination type before it is shifted
# into the upper half.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
761 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
762 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
763 # if either of its arguments are 32.
764 binop_convert("bfm", tuint32
, tint32
, "", """
765 int bits = src0, offset = src1;
766 if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
767 dst = 0; /* undefined */
769 dst = ((1u << bits) - 1) << offset;
772 opcode("ldexp", 0, tfloat
, [0, 0], [tfloat
, tint32
], "", """
773 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
774 /* flush denormals to zero. */
776 dst = copysignf(0.0f, src0);
779 # Combines the first component of each input to make a 2-component vector.
781 binop_horiz("vec2", 2, tuint
, 1, tuint
, 1, tuint
, """
# src1 selects which byte of src0 to extract (shift of src1 * 8 bits); the
# cast to uint8_t / int8_t truncates to that byte, the signed cast making
# the result sign-extended.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Same for 16-bit words (shift of src1 * 16 bits).
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, const_expr):
    """Register a three-source opcode whose sources and result all use ``ty``.

    Output size and all three input sizes are 0, and no algebraic
    properties are attached.
    """
    opcode(name,
           output_size=0,
           output_type=ty,
           input_sizes=[0, 0, 0],
           input_types=[ty, ty, ty],
           algebraic_properties="",
           const_expr=const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source opcode with explicit component counts.

    The output and all three sources are typed ``tuint``; no algebraic
    properties are attached.
    """
    source_sizes = [src1_size, src2_size, src3_size]
    source_types = [tuint, tuint, tuint]
    opcode(name, output_size, tuint, source_sizes, source_types, "",
           const_expr)
# Fused multiply-add, folded as a plain multiply followed by an add.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).

triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")

# Three-way minimum, maximum and median.
# NOTE(review): the float versions use fminf/fmaxf for every bit size -
# confirm this is intended for 64-bit sources.
triop("fmin3", tfloat, "fminf(src0, fminf(src1, src2))")
triop("imin3", tint, "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "MAX2(src0, MAX2(src1, src2))")

# Median: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")

# Boolean select: the condition is a tbool1 (bcsel) or tbool32 (b32csel)
# source, and the two selected values are tuint.
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], "", "src0 ? src1 : src2")
834 triop("bfi", tuint32
, """
835 unsigned mask = src0, insert = src1, base = src2;
844 dst = (base & ~mask) | (insert & mask);
848 # SM5 ubfe/ibfe assembly
849 opcode("ubfe", 0, tuint32
,
850 [0, 0, 0], [tuint32
, tint32
, tint32
], "", """
851 unsigned base = src0;
852 int offset = src1, bits = src2;
855 } else if (bits < 0 || offset < 0) {
856 dst = 0; /* undefined */
857 } else if (offset + bits < 32) {
858 dst = (base << (32 - bits - offset)) >> (32 - bits);
860 dst = base >> offset;
863 opcode("ibfe", 0, tint32
,
864 [0, 0, 0], [tint32
, tint32
, tint32
], "", """
866 int offset = src1, bits = src2;
869 } else if (bits < 0 || offset < 0) {
870 dst = 0; /* undefined */
871 } else if (offset + bits < 32) {
872 dst = (base << (32 - bits - offset)) >> (32 - bits);
874 dst = base >> offset;
878 # GLSL bitfieldExtract()
879 opcode("ubitfield_extract", 0, tuint32
,
880 [0, 0, 0], [tuint32
, tint32
, tint32
], "", """
881 unsigned base = src0;
882 int offset = src1, bits = src2;
885 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
886 dst = 0; /* undefined per the spec */
888 dst = (base >> offset) & ((1ull << bits) - 1);
891 opcode("ibitfield_extract", 0, tint32
,
892 [0, 0, 0], [tint32
, tint32
, tint32
], "", """
894 int offset = src1, bits = src2;
897 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
900 dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
904 # Combines the first component of each input to make a 3-component vector.
906 triop_horiz("vec3", 3, 1, 1, 1, """
912 def quadop_horiz(name
, output_size
, src1_size
, src2_size
, src3_size
,
913 src4_size
, const_expr
):
914 opcode(name
, output_size
, tuint
,
915 [src1_size
, src2_size
, src3_size
, src4_size
],
916 [tuint
, tuint
, tuint
, tuint
],
919 opcode("bitfield_insert", 0, tuint32
, [0, 0, 0, 0],
920 [tuint32
, tuint32
, tint32
, tint32
], "", """
921 unsigned base = src0, insert = src1;
922 int offset = src2, bits = src3;
925 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
928 unsigned mask = ((1ull << bits) - 1) << offset;
929 dst = (base & ~mask) | ((insert << offset) & mask);
933 quadop_horiz("vec4", 4, 1, 1, 1, 1, """