nir: Add a saturated unsigned integer add opcode
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Class that represents all the information we have about the opcode.

    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - output_size and every entry of input_sizes is an integer in the
          range 0..4; when output_size is non-zero no input size may be 0
        - all types are strings that get nir_type_ prepended to them
        - input_sizes and input_types are non-empty lists of equal length,
          holding one entry per source operand
        - algebraic_properties is a space-separated string, where nir_op_is_
          is prepended before each entry
        - const_expr is an expression or series of statements that computes
          the constant value of the opcode given the constant values of its
          inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        assert isinstance(input_types, list)
        assert len(input_sizes) == len(input_types)
        # At least one source operand is required.
        assert input_sizes, 'Opcode "{}" has no inputs'.format(name)
        # Validate every operand, not just the first one, so that a bad entry
        # later in the list is reported as a validation failure instead of
        # surfacing as an unrelated error (or slipping through unnoticed).
        assert all(isinstance(size, int) for size in input_sizes)
        assert all(isinstance(type_, str) for type_ in input_types)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
89
# helper variables for strings
#
# Names without a numeric suffix ("float", "int", "uint", "bool") carry no
# bit size; the remaining names have the bit size appended in decimal.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Splits a type string into its base type ("type" group) and an optional
# decimal bit-size suffix ("bits" group, None when the suffix is absent).
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
105
def type_has_size(type_):
    """Return True when the type string ends in an explicit bit size.

    Asserts that type_ starts with one of the known base type names.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.groupdict()['bits']
    return bits is not None
110
def type_size(type_):
    """Return the bit size encoded in the type string as an int.

    Asserts that type_ is a recognised type string and that it actually
    carries a bit-size suffix.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.groupdict()['bits']
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
117
def type_sizes(type_):
    """Return the list of bit sizes the given type string may take.

    A sized type yields just its own size.  For unsized types the list is
    [32] for 'bool', [16, 32, 64] for 'float' and [8, 16, 32, 64] otherwise.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    if type_ == 'bool':
        return [32]
    if type_ == 'float':
        return [16, 32, 64]
    return [8, 16, 32, 64]
127
def type_base_type(type_):
    """Return the base type name (int, uint, float or bool) of a type string.

    Asserts that type_ starts with one of the known base type names.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return parsed.groupdict()['type']
132
# Algebraic property names.  Each one ends in a space so that several can be
# joined with "+" into the space-separated algebraic_properties string.
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes, mapping opcode name -> Opcode instance
opcodes = {}
138
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode from the arguments and add it to the global table.

    Asserts that no opcode called name has been registered before.
    """
    assert name not in opcodes
    new_op = Opcode(name, output_size, output_type, input_sizes,
                    input_types, algebraic_properties, const_expr)
    opcodes[name] = new_op
144
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose source and result types may differ.

    Both the output size and the input size are registered as 0 and no
    algebraic properties are set.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0], input_types=[in_type],
           algebraic_properties="", const_expr=const_expr)
147
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type ty.

    Sizes are registered as 0 and no algebraic properties are set.
    """
    no_properties = ""
    opcode(name, 0, ty, [0] * 1, [ty] * 1, no_properties, const_expr)
150
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes.

    No algebraic properties are set.
    """
    opcode(name=name, output_size=output_size, output_type=output_type,
           input_sizes=[input_size], input_types=[input_type],
           algebraic_properties="", const_expr=const_expr)
155
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a one-source reduction.

    Each component of src0 is substituted for {src} in prereduce_expr and the
    result is parenthesized.  The per-component results are then combined
    pairwise through reduce_expr ({src0} and {src1}), and the parenthesized
    combination is substituted for {src} in final_expr.  The opcodes are
    registered as name + "2", name + "3" and name + "4" through unop_horiz.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = ["(" + prereduce_expr.format(src="src0." + chan) + ")"
                  for chan in "xyzw"]
    xy = combine(x, y)
    reductions = [
        (2, xy),
        (3, combine(xy, z)),
        (4, combine(xy, combine(z, w))),
    ]
    for width, expr in reductions:
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, finish(expr))
174
175
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Arithmetic negation and bitwise/logical inversion.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 when the source is 0.0 and 0.0 otherwise.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# Sign extraction: the result is 0, 1 or -1.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat clamps the source to the range [0.0, 1.0].
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# Base-2 exponential and logarithm.
# NOTE(review): these always call the single-precision C routines, even when
# bit_size is 64 -- confirm whether that is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
200
# Generate all of the numeric conversion opcodes
#
# Every (source type, destination type) pair listed below gets one opcode per
# destination bit size, named "<s>2<d><bits>" from the first letters of the
# two type names.  float -> 16-bit float additionally gets "_rtne" and "_rtz"
# rounding-mode variants next to the plain one.
for src_t, dst_types in [
        (tint, [tfloat, tint, tbool]),
        (tuint, [tfloat, tuint]),
        (tfloat, [tint, tuint, tfloat, tbool]),
        (tbool, [tfloat, tint]),
]:
    for dst_t in dst_types:
        for bit_size in type_sizes(dst_t):
            opname = "{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size)
            sized_dst = dst_t + str(bit_size)
            if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
                for rnd_mode in ['_rtne', '_rtz', '']:
                    unop_convert(opname + rnd_mode, sized_dst, src_t, "src0")
            else:
                # Conversions to bool test the source against zero.
                conv_expr = "src0 != 0" if dst_t == tbool else "src0"
                unop_convert(opname, sized_dst, src_t, conv_expr)
224
225
# Unary floating-point rounding operations.
#
# Each one picks the double-precision C routine when bit_size is 64 and the
# single-precision routine otherwise.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values whose magnitude is below 2^-14 become a zero carrying the sign of
# the source; everything else is converted to half precision and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp stores the exponent produced by frexp() in dst; frexp_sig stores
# the value frexp() returns and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat64, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat64, tfloat64, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
256
257
258 # Floating point pack and unpack operations.
259
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components -> one uint32.

    src0.x is packed into the low 16 bits and src0.y into the high 16 bits,
    each through the pack_<fmt>_1x16 helper named in the C expression.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    unop_horiz("pack_{}_2x16".format(fmt), 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
265
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components -> one uint32.

    Components x, y, z and w are packed through pack_<fmt>_1x8 into bits
    0-7, 8-15, 16-23 and 24-31 respectively.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    unop_horiz("pack_{}_4x8".format(fmt), 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
273
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 -> two float32 components.

    dst.x is unpacked from the low 16 bits of src0.x and dst.y from the high
    16 bits, each through the unpack_<fmt>_1x16 helper named in the C
    expression.
    """
    # The high half must be shifted *down* before the uint16_t cast; shifting
    # left by 16 and truncating to 16 bits would always produce 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
279
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 -> four float32 components.

    Components x, y, z and w are unpacked through unpack_<fmt>_1x8 from bits
    0-7, 8-15, 16-23 and 24-31 of src0.x respectively.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    unop_horiz("unpack_{}_4x8".format(fmt), 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
287
288
# Instantiate the pack/unpack opcodes for each format.  The "half" format is
# only instantiated in its 2x16 form.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
299
# Integer packing.  pack_uvec2_to_uint masks the low component to 16 bits
# before OR-ing in the second component shifted up by 16.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# NOTE(review): the components are OR-ed together without masking, so this
# presumably expects each component to already fit in 8 bits -- confirm.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# Pack narrower unsigned components into one wider value, component x in the
# least significant bits.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Split a 64-bit value into its low (x) and high (y) 32-bit halves.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
322
# Split a 64-bit value into four 16-bit pieces, least significant first.
# The source has a single component, so every piece -- including dst.w --
# must be taken from src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
325
# Split a 32-bit value into its low (x) and high (y) 16-bit halves.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.
#
# The _split_x variants read the low half of the source and the _split_y
# variants read the high half.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
342
# Bit operations, part of ARB_gpu_shader5.


# Reverses the order of the 32 bits of the source.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")

# Index of the most significant set bit, or -1 when no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")

# Signed variant: index of the most significant bit that differs from the
# sign (see the comment inside the expression), or -1 when there is none.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
393
394
# Register fnoise<i>_<j> for every output size i and input size j in 1..4.
# The constant expression of each one is simply 0.0f.
for i in range(1, 5):
    for j in range(1, 5):
        noise_name = "fnoise" + str(i) + "_" + str(j)
        unop_horiz(noise_name, i, tfloat, j, tfloat, "0.0f")
398
399
# AMD_gcn_shader extended instructions
#
# Both opcodes pick the axis of src0 with the largest magnitude (ties are
# resolved by the later tests overwriting the earlier ones) and branch on the
# sign of that axis.  cube_face_coord writes a pair of coordinates taken from
# the other two axes.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
""")

# cube_face_index writes a face number from 0 to 5: 0/1 for +X/-X, 2/3 for
# +Y/-Y and 4/5 for +Z/-Z.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
425
426
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    The result type may differ from the source type.  All sizes are
    registered as 0.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0, 0], input_types=[in_type, in_type],
           algebraic_properties=alg_props, const_expr=const_expr)
429
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type ty."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
432
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison: sources of type ty, bool32 result."""
    binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
435
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types per operand.

    No algebraic properties are set.
    """
    opcode(name=name, output_size=out_size, output_type=out_type,
           input_sizes=[src1_size, src2_size],
           input_types=[src1_type, src2_type],
           algebraic_properties="", const_expr=const_expr)
440
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a two-source reduction.

    Matching components of src0 and src1 are substituted for {src0} and
    {src1} in prereduce_expr and the result is parenthesized.  The
    per-component results are combined pairwise through reduce_expr, and the
    parenthesized combination is substituted for {src} in final_expr.  The
    opcodes are registered as name + "2", name + "3" and name + "4", all
    marked commutative.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + chan,
                                    src1="src1." + chan) + ")"
        for chan in "xyzw"
    ]
    xy = combine(x, y)
    reductions = [
        (2, xy),
        (3, combine(xy, z)),
        (4, combine(xy, combine(z, w))),
    ]
    for width, expr in reductions:
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], commutative,
               finish(expr))
462
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
# Saturating unsigned add: when the sum wraps around (sum < src0) the result
# is UINT64_MAX, which becomes all-ones once stored in the destination.
# NOTE(review): if src0/src1 are C types narrower than int, integer promotion
# keeps "src0 + src1" from wrapping, so the overflow test may never fire for
# small bit sizes -- confirm against how the constant folder declares them.
binop("uadd_sat", tuint, commutative,
      "(src0 + src1) < src0 ? UINT64_MAX : (src0 + src1)")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low bits of the signed/unsigned integer multiply (the product is stored in
# a destination of the same type as the sources)
binop("imul", tint, commutative + associative, "src0 * src1")
473
# high half of the signed integer multiply: the full product shifted right
# by bit_size.  The 64-bit case goes through a multi-word multiply on 32-bit
# limbs (ubm_mul_u32arr) and keeps limbs 2 and 3 of the product.
binop("imul_high", tint, commutative, """
if (bit_size == 64) {
/* We need to do a full 128-bit x 128-bit multiply in order for the sign
* extension to work properly. The casts are kind-of annoying but needed
* to prevent compiler warnings.
*/
uint32_t src0_u32[4] = {
src0,
(int64_t)src0 >> 32,
(int64_t)src0 >> 63,
(int64_t)src0 >> 63,
};
uint32_t src1_u32[4] = {
src1,
(int64_t)src1 >> 32,
(int64_t)src1 >> 63,
(int64_t)src1 >> 63,
};
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high half of the unsigned integer multiply: the full product shifted right
# by bit_size, with the same multi-word path for the 64-bit case.
binop("umul_high", tuint, commutative, """
if (bit_size == 64) {
/* The casts are kind-of annoying but needed to prevent compiler warnings. */
uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")
514
# Division.  The integer variants fold a division by zero to 0 instead of
# evaluating it.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.  (The result is registered with the unsigned
# integer type rather than a bool type.)

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.  (The result is registered with the unsigned
# integer type rather than a bool type.)

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# Unsigned modulus; a zero divisor folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder". We follow the conventions used by LLVM and
# SPIR-V. The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation. For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# Floating-point counterparts: fmod rounds the quotient down (floorf) and
# frem rounds it towards zero (truncf).
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
545
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# "ball" variants AND the per-component equality tests together; "bany"
# variants OR the per-component inequality tests together.

binop_reduce("ball_fequal", 1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal", 1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is registered with the unsized float type while its three
# siblings use float32 -- confirm whether the difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
589
590
# Shifts.  The shift count (second source) is always a uint32, whatever the
# type of the value being shifted.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: per-component products summed together.  The "_replicated"
# variant is registered with an output size of 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Three-component dot product plus the w component of the second source.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum / maximum.  Unlike the integer variants, fmin and fmax are
# registered without any algebraic properties.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
635
# The opcodes below treat each 32-bit source as four independent 8-bit
# lanes and process one lane per loop iteration.

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A lane is left at 0 unless the src0 lane is greater than the src1 lane.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
680
# Power function.  pow() is the double-precision C routine and powf() the
# single-precision one, so the 64-bit case must use pow().
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
682
# Pack two values into one wider value, the first source in the low half and
# the second source in the high half.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
691
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# The result is a mask of "bits" one-bits starting at bit "offset";
# out-of-range arguments fold to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
dst = 0; /* undefined */
else
dst = ((1u << bits) - 1) << offset;
""")

# Scale a float by a power of two given as an int32 exponent.
# NOTE(review): isnormal() is also false for infinities and NaNs, so those
# results are replaced by a signed zero as well -- confirm that is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# src1 selects which byte of src0 to extract; the i8 variant sign-extends
# through the int8_t cast.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# src1 selects which 16-bit word of src0 to extract; the i16 variant
# sign-extends through the int16_t cast.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
724
725
def triop(name, ty, const_expr):
    """Register a three-source opcode whose sources and result share type ty.

    All sizes are registered as 0 and no algebraic properties are set.
    """
    num_srcs = 3
    opcode(name, 0, ty, [0] * num_srcs, [ty] * num_srcs, "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source uint opcode with explicit operand sizes.

    The result and all three sources use the unsized uint type and no
    algebraic properties are set.
    """
    src_sizes = [src1_size, src2_size, src3_size]
    src_types = [tuint] * len(src_sizes)
    opcode(name, output_size, tuint, src_sizes, src_types, "", const_expr)
732
# Fused multiply-add: src0 * src1 + src2.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")

# 3 way min/max/med
triop("fmin3", tfloat, "fminf(src0, fminf(src1, src2))")
triop("imin3", tint, "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")

# Integer-bool conditional select: the condition is a bool32, the selected
# values are unsigned integers.
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], "", "src0 ? src1 : src2")
761
# SM5 bfi assembly
#
# Inserts "insert" into "base" under "mask": insert is shifted left until it
# lines up with the lowest set bit of the mask.  A zero mask returns base
# unchanged (and avoids looping forever in the alignment loop).
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
unsigned tmp = mask;
while (!(tmp & 1)) {
tmp >>= 1;
insert <<= 1;
}
dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
#
# Extract "bits" bits of "base" starting at bit "offset".  ubfe works on an
# unsigned base, ibfe on a signed one.  When offset + bits reaches 32 or
# more, the result is simply base shifted right by offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
806
# GLSL bitfieldExtract()
#
# Unsigned variant: shift the field down to bit 0 and mask it to "bits" bits.
# The mask is built from a 64-bit one (1ull) so that bits == 32 works.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed variant of GLSL bitfieldExtract().
#
# The field occupies bits [offset, offset + bits) of base.  Shifting left by
# (32 - offset - bits) moves the top bit of the field into bit 31; the
# following arithmetic shift right by (32 - bits) then brings the field down
# to bit 0 while replicating its top bit into the upper bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
dst = 0;
} else {
dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
832
# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
840
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source uint opcode with explicit operand sizes.

    The result and all four sources use the unsized uint type and no
    algebraic properties are set.
    """
    src_sizes = [src1_size, src2_size, src3_size, src4_size]
    src_types = [tuint] * len(src_sizes)
    opcode(name, output_size, tuint, src_sizes, src_types, "", const_expr)
847
# Replace "bits" bits of "base", starting at bit "offset", with the low bits
# of "insert".  bits == 0 returns base unchanged; out-of-range arguments fold
# to 0.  The mask is built from a 64-bit one (1ull) so that bits == 32 works.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
dst = 0;
} else {
unsigned mask = ((1ull << bits) - 1) << offset;
dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
868
869