nir: Mark ffma as 2src_commutative
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Everything this generator knows about a single NIR ALU opcode.

    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Validate the description of one opcode and store it on the instance.

        Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_sizes and input_types are lists of equal length, one entry
          per source; num_inputs is derived from that length
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_
          is prepended before each entry
        - const_expr is an expression or series of statements that computes
          the constant value of the opcode given the constant values of its
          inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.

        Every check below is an assert: sizes must lie in 0..4, and when
        output_size is non-zero no input size may be zero.
        """
        # Type checks, kept in the same order as the parameters so the first
        # offending argument is the one reported.
        for value, expected in ((name, str), (output_size, int),
                                (output_type, str), (input_sizes, list)):
            assert isinstance(value, expected)
        assert isinstance(input_sizes[0], int)
        assert isinstance(input_types, list)
        assert isinstance(input_types[0], str)
        for value, expected in ((is_conversion, bool),
                                (algebraic_properties, str),
                                (const_expr, str)):
            assert isinstance(value, expected)

        # Shape checks.
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        output_is_unsized = (output_size == 0)
        for size in input_sizes:
            assert 0 <= size <= 4
            # An explicitly sized output requires explicitly sized inputs.
            assert output_is_unsized or size != 0

        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
92
# Helper variables for the NIR type strings used by the opcode definitions
# below.  Names without a numeric suffix carry no bit size (type_has_size()
# returns False for them); the suffixed ones name one specific bit size.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
107
# Splits a NIR type string into its base type and an optional bit size.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')

def type_has_size(type_):
    """Return True when the NIR type string *type_* carries a bit size.

    Asserts that *type_* starts with one of the known base types.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
114
def type_size(type_):
    """Return the bit size encoded in the NIR type string *type_* as an int.

    Asserts both that the string is a recognised type and that it actually
    has a bit-size suffix.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
121
def type_sizes(type_):
    """Return the list of bit sizes the NIR type string *type_* may take.

    A sized type yields just its own size.  Unsized 'bool' and 'float' have
    dedicated lists; every other unsized type gets the integer list.
    """
    if type_has_size(type_):
        return [type_size(type_)]

    # A fresh dict (and fresh lists) per call, so callers may mutate the result.
    unsized = {
        'bool': [1, 32],
        'float': [16, 32, 64],
    }
    return unsized.get(type_, [1, 8, 16, 32, 64])
131
def type_base_type(type_):
    """Return the base type ('int', 'uint', 'float' or 'bool') of *type_*.

    Any bit-size suffix is dropped.  Asserts that the string is recognised.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
136
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
#
# Both property strings end in a space so that they can be combined with "+"
# into the space-separated algebraic_properties string that Opcode expects.
_2src_commutative = "2src_commutative "
associative = "associative "
144
# global dictionary of opcodes, keyed by opcode name
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Build an Opcode from the arguments and record it in ``opcodes``.

    Each name may be registered only once: a repeated name trips the
    assertion before anything is constructed or stored.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes, input_types,
                  is_conversion, algebraic_properties, const_expr)
    opcodes[name] = info
154
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose source and result types may differ.

    Both sizes are passed as 0 and the opcode is not flagged as a conversion.
    """
    unsized = 0
    opcode(name, unsized, out_type, [unsized], [in_type], False, "",
           const_expr)
157
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type *ty*.

    Both sizes are passed as 0 and no algebraic properties are set.
    """
    unsized = 0
    opcode(name, unsized, ty, [unsized], [ty], False, "", const_expr)
160
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit source and result sizes.

    No algebraic properties are set and the opcode is not flagged as a
    conversion.
    """
    source_sizes = [input_size]
    source_types = [input_type]
    opcode(name, output_size, output_type, source_sizes, source_types,
           False, "", const_expr)
165
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component forms of a unary reduction.

    - prereduce_expr is formatted with {src} for each source component and
      the result is wrapped in parentheses
    - reduce_expr combines two partial results via {src0} and {src1}
    - final_expr receives the parenthesised reduction as {src}

    The opcodes are named name + "2", name + "3" and name + "4".
    """
    x, y, z, w = ["(" + prereduce_expr.format(src=component) + ")"
                  for component in ("src0.x", "src0.y", "src0.z", "src0.w")]

    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def emit(width, reduced):
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, final_expr.format(src="(" + reduced + ")"))

    emit(2, combine(x, y))
    emit(3, combine(combine(x, y), z))
    # The 4-wide form pairs the components up as (x, y) and (z, w).
    emit(4, combine(combine(x, y), combine(z, w)))
184
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion.

    Identical to unop_convert() except that is_conversion is True.
    """
    unsized = 0
    opcode(name, unsized, out_type, [unsized], [in_type], True, "",
           const_expr)
187
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Negation, inversion, sign and absolute value.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 when the source compares equal to zero and 0.0 otherwise.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# fsign/isign yield 0 for zero, 1 for positive and -1 for negative sources.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat clamps the source to the range [0, 1].
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root select the double- or
# single-precision form based on bit_size.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): unlike the opcodes above, fexp2 and flog2 call the
# single-precision functions regardless of bit_size -- confirm this is
# intended for 64-bit sources.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
212
# Generate all of the numeric conversion opcodes.
#
# Each entry pairs a source type with the destination types it can convert
# to.  One opcode is emitted per destination bit size, named
# <src initial>2<dst initial><bits>; float -> float16 additionally gets one
# opcode per rounding-mode suffix.
for src_t, dst_types in [(tint, [tfloat, tint, tbool]),
                         (tuint, [tfloat, tuint]),
                         (tfloat, [tint, tuint, tfloat, tbool]),
                         (tbool, [tfloat, tint])]:
    for dst_t in dst_types:
        for bit_size in type_sizes(dst_t):
            if not (bit_size == 16 and dst_t == tfloat and src_t == tfloat):
                # Conversions to bool test for non-zero; all others pass the
                # value through and rely on the typed destination.
                conv_expr = "src0 != 0" if dst_t == tbool else "src0"
                unop_numeric_convert(
                    "{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
                    dst_t + str(bit_size), src_t, conv_expr)
                continue

            rnd_modes = ['_rtne', '_rtz', '']
            for rnd_mode in rnd_modes:
                unop_numeric_convert(
                    "{0}2{1}{2}{3}".format(src_t[0], dst_t[0], bit_size,
                                           rnd_mode),
                    dst_t + str(bit_size), src_t, "src0")
236
237
# Unary floating-point rounding operations.
#
# Each picks the double- or single-precision C function based on bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Round-trips the value through half precision; magnitudes below 2^-14 are
# replaced by a zero carrying the source's sign.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
#
# frexp_exp stores the exponent that frexp() writes through its pointer
# argument; frexp_sig keeps frexp()'s return value and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
268
269
270 # Floating point pack and unpack operations.
271
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components into one uint32.

    The constant expression calls pack_<fmt>_1x16 on each component, placing
    .x in the low 16 bits and .y in the high 16 bits.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    opcode_name = "pack_{0}_2x16".format(fmt)
    unop_horiz(opcode_name, 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
277
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components into one uint32.

    The constant expression calls pack_<fmt>_1x8 on each component and ORs
    the results in at bit offsets 0, 8, 16 and 24.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    opcode_name = "pack_{0}_4x8".format(fmt)
    unop_horiz(opcode_name, 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
285
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 into two float32 components.

    The constant expression calls unpack_<fmt>_1x16 on the low 16 bits for
    .x and on the high 16 bits for .y.
    """
    # The high half must be shifted *down* before the uint16_t cast; a left
    # shift by 16 leaves the low 16 bits zero, so .y would always unpack 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
291
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 into four float32 components.

    The constant expression calls unpack_<fmt>_1x8 on each byte of the
    source, lowest byte first.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    opcode_name = "unpack_{0}_4x8".format(fmt)
    unop_horiz(opcode_name, 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
299
300
# Instantiate the normalized and half-float pack/unpack opcodes using the
# helper functions defined above.  "half" only gets the 2x16 forms.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Combines the low 16 bits of src0.x with src0.y shifted into the upper half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# ORs the four components together at bit offsets 0, 8, 16 and 24.  The
# components are not masked before being shifted.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# Width-changing packs: every component but the first is cast to the
# destination width before being shifted into position.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Splits a 64-bit value into two 32-bit words: low word in .x, high in .y.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
334
# Splits one 64-bit value into four 16-bit words, lowest word first.  Every
# word is taken from src0.x: the source is declared with a single component,
# so there is no src0.w to read the top word from.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
337
# Splits a 32-bit value into two 16-bit words: low word in .x, high in .y.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.
#
# The *_split_x opcodes yield the low half of the source and the *_split_y
# opcodes the high half, each as a separate per-component opcode.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
354
# Bit operations, part of ARB_gpu_shader5.


# Mirrors the 32 bits of the source: bit N moves to bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")

# Index of the highest set bit, or -1 when no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")

# Signed variant, 32-bit only: for negative sources the search is for the
# highest clear bit instead.  Yields -1 when no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the lowest set bit, or -1 when no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
405
406
# Register fnoise<i>_<j> for every output width i and input width j in 1..4.
# All of them constant-fold to 0.0f.
for i in (1, 2, 3, 4):
    for j in (1, 2, 3, 4):
        unop_horiz("fnoise" + str(i) + "_" + str(j), i, tfloat, j, tfloat,
                   "0.0f")
410
411
# AMD_gcn_shader extended instructions

# Takes a 3-component direction and produces a 2-component face coordinate.
# The component with the largest magnitude selects the face; "ma" is twice
# that component, and the two remaining components (with face-dependent
# signs) are divided by it and offset by 0.5.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# Produces the face index 0..5 in the order +X, -X, +Y, -Y, +Z, -Z.  The
# tests run in sequence, so when magnitudes tie the last matching face wins.
# NOTE(review): dst.x is only assigned inside the conditionals; if none of
# them holds (e.g. NaN inputs) nothing is written here -- confirm the
# generated code initializes dst.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
446
447
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share *in_type*.

    The result type may differ from the source type.  All sizes are passed
    as 0 and the opcode is not flagged as a conversion.
    """
    source_sizes = [0] * 2
    source_types = [in_type] * 2
    opcode(name, 0, out_type, source_sizes, source_types,
           False, alg_props, const_expr)
451
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type *ty*."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
454
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on *ty* that yields a bool1 result."""
    binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
457
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on *ty* that yields a bool32 result."""
    binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
460
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types per operand.

    No algebraic properties are set and the opcode is not flagged as a
    conversion.
    """
    source_sizes = [src1_size, src2_size]
    source_types = [src1_type, src2_type]
    opcode(name, out_size, out_type, source_sizes, source_types,
           False, "", const_expr)
465
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component forms of a binary reduction.

    - prereduce_expr is formatted with {src0}/{src1} for each pair of
      matching source components and the result is wrapped in parentheses
    - reduce_expr combines two partial results via {src0} and {src1}
    - final_expr receives the parenthesised reduction as {src}

    The opcodes are named name + "2", name + "3" and name + "4"; all of
    them are marked 2src_commutative.
    """
    x, y, z, w = ["(" + prereduce_expr.format(src0="src0." + component,
                                              src1="src1." + component) + ")"
                  for component in "xyzw"]

    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def emit(width, reduced):
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], False, _2src_commutative,
               final_expr.format(src="(" + reduced + ")"))

    emit(2, combine(x, y))
    emit(3, combine(combine(x, y), z))
    # The 4-wide form pairs the components up as (x, y) and (z, w).
    emit(4, combine(combine(x, y), combine(z, w)))
487
# Addition.  The plain adds are both commutative and associative; the
# saturating forms are only marked commutative.
binop("fadd", tfloat, _2src_commutative + associative, "src0 + src1")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Signed saturating add: on wrap-around the result is the largest positive
# value ((1 << (bit_size - 1)) - 1) or the sign-bit pattern
# (1 << (bit_size - 1)), depending on the sign of src1.
binop("iadd_sat", tint, _2src_commutative, """
src1 > 0 ?
(src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
(src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
""")
# Unsigned saturating add: a sum smaller than src0 means it wrapped.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating subtraction mirrors the saturating adds above.
binop("isub_sat", tint, "", """
src1 < 0 ?
(src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
(src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, _2src_commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")
516
# high 32-bits of signed integer multiply
#
# For bit sizes below 64 the product is formed in 64 bits and shifted right
# by bit_size.  The 64-bit case uses ubm_mul_u32arr on arrays of 32-bit
# words and assembles the result from words 2 and 3 of the product.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
/* We need to do a full 128-bit x 128-bit multiply in order for the sign
* extension to work properly. The casts are kind-of annoying but needed
* to prevent compiler warnings.
*/
uint32_t src0_u32[4] = {
src0,
(int64_t)src0 >> 32,
(int64_t)src0 >> 63,
(int64_t)src0 >> 63,
};
uint32_t src1_u32[4] = {
src1,
(int64_t)src1 >> 32,
(int64_t)src1 >> 63,
(int64_t)src1 >> 63,
};
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
#
# Same structure as imul_high, but each source is split into only two
# 32-bit words since no sign extension is needed.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
/* The casts are kind-of annoying but needed to prevent compiler warnings. */
uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")
557
# Division.  The integer forms fold a division by zero to 0 rather than
# evaluating the C division.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
571
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       = (x & y) + (x & ~y) + (x & y) + (~x & y)
#       = 2 * (x & y) + (x & ~y) + (~x & y)
#       = ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              = (x & y) + ((x ^ y) >> 1)
#
# The signed and unsigned forms use the same expression; only the operand
# type differs.
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
584
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
#           = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
#           = ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
#
# The second term must be subtracted, as derived above.  Adding it gives
# wrong results, e.g. for x = 3, y = 0: (3 + 0 + 1) >> 1 == 2, and
# (3 | 0) - (3 >> 1) == 2, whereas (3 | 0) + (3 >> 1) == 4.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
597
# All integer remainder/modulus opcodes fold a zero divisor to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder". We follow the conventions used by LLVM and
# SPIR-V. The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation. For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# NOTE(review): fmod and frem call the single-precision floorf/truncf
# regardless of bit_size -- confirm this is intended for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
614
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
#
# The unsuffixed opcodes go through binop_compare (bool1 result) and the
# "32"-suffixed ones through binop_compare32 (bool32 result).  Only the
# equality and inequality tests are marked commutative.

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare("fne", tfloat, _2src_commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare("ine", tint, _2src_commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
binop_compare32("flt32", tfloat, "", "src0 < src1")
binop_compare32("fge32", tfloat, "", "src0 >= src1")
binop_compare32("feq32", tfloat, _2src_commutative, "src0 == src1")
binop_compare32("fne32", tfloat, _2src_commutative, "src0 != src1")
binop_compare32("ilt32", tint, "", "src0 < src1")
binop_compare32("ige32", tint, "", "src0 >= src1")
binop_compare32("ieq32", tint, _2src_commutative, "src0 == src1")
binop_compare32("ine32", tint, _2src_commutative, "src0 != src1")
binop_compare32("ult32", tuint, "", "src0 < src1")
binop_compare32("uge32", tuint, "", "src0 >= src1")
642
# integer-aware GLSL-style comparisons that compare floats and ints
#
# Each binop_reduce call registers 2-, 3- and 4-component variants.  The
# "all" forms AND the per-component equalities together; the "any" forms OR
# the per-component inequalities together.

binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# Same reductions with a bool32 result.
binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with the unsized float type while slt, seq
# and sne use float32 -- confirm whether the difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
677
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a uint32 and is masked with
# (sizeof(src0) * 8 - 1) before use.  These are registered with opcode()
# directly because the two sources have different types.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, _2src_commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, _2src_commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, _2src_commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
710
# Dot products: multiply matching components and sum the products.  fdot has
# a 1-component result; fdot_replicated is registered with a 4-component
# result and the same expression.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph takes a 3-component src0 and a 4-component src1: a 3-component dot
# product with src1.w added on top.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.
# NOTE(review): fmin/fmax carry no algebraic properties, unlike the integer
# variants, and call the single-precision fminf/fmaxf for every bit size --
# confirm both are intended.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
728
# The opcodes below treat a 32-bit value as four 8-bit lanes.  Each loop
# steps i over the bit offsets 0, 8, 16 and 24, extracts one lane from each
# source with (src >> i) & 0xff, and ORs the lane result back in at offset i.

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A lane is left at 0 unless src0's lane is strictly greater than src1's.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
773
# pow() is the double-precision routine and powf() the single-precision one,
# so the 64-bit case is the one that must call pow().  This matches the
# selection used by fsqrt, fsin, fcos and the rounding opcodes.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
775
# Two-source packs: src0 supplies the low half and src1 the high half.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments is 32.
#
# src0 is the number of mask bits and src1 the bit offset of the mask.
# Arguments outside 0..31, or whose sum exceeds 32, fold to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
dst = 0; /* undefined */
else
dst = ((1u << bits) - 1) << offset;
""")

# ldexp scales src0 by 2 to the power src1 (an int32).
# NOTE(review): isnormal() is also false for zero, infinities and NaN, so
# those results are replaced by a signed zero as well -- confirm that this
# is the intended flushing behavior.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
""")
802
# vec2: assemble a 2-component vector from the first component (.x) of each
# of the two sources.
binop_horiz(
   "vec2", 2, tuint, 1, tuint, 1, tuint,
   """
dst.x = src0.x;
dst.y = src1.x;
""")
809
# Byte extraction: take byte number src1 of src0; the cast in the expression
# zero-extends (u8) or sign-extends (i8) it.
binop("extract_u8", tuint, "",
      "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "",
      "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: the same for 16-bit word number src1 of src0.
binop("extract_u16", tuint, "",
      "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "",
      "(int16_t)(src0 >> (src1 * 16))")
817
818
def triop(name, ty, alg_props, const_expr):
   """Register a three-source opcode whose sources and result share one type.

   Forwards to opcode() with an output size of 0, three input sizes of 0,
   ``ty`` as both the output type and every input type, and is_conversion
   set to False.

   - name is the opcode name
   - ty is the common type of the destination and all three sources
   - alg_props is passed through as the algebraic properties
   - const_expr is passed through as the constant expression
   """
   source_sizes = [0] * 3
   source_types = [ty] * 3
   opcode(name, 0, ty, source_sizes, source_types, False, alg_props,
          const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Register a three-source opcode with explicit output and source sizes.

   Forwards to opcode() with tuint as the output type and as the type of all
   three sources, is_conversion set to False and no algebraic properties.

   - name is the opcode name
   - output_size is the size passed for the destination
   - src1_size, src2_size, src3_size are the sizes passed for the sources
   - const_expr is passed through as the constant expression
   """
   source_sizes = [src1_size, src2_size, src3_size]
   source_types = [tuint] * len(source_sizes)
   opcode(name, output_size, tuint, source_sizes, source_types, False, "",
          const_expr)
825
# Multiply-add: src0 * src1 + src2.  Flagged _2src_commutative because
# swapping src0 and src1 leaves the expression unchanged.
triop("ffma", tfloat, _2src_commutative,
      "src0 * src1 + src2")

# Linear interpolation between src0 and src1, weighted by src2.
triop("flrp", tfloat, "",
      "src0 * (1 - src2) + src1 * src2")
829
# Conditional select
#
# A per-component vector select (the vector analogue of ?:).  It comes in
# two flavours: one whose condition is a floating-point boolean (0.0 vs 1.0)
# and one whose condition is an integer boolean (0 vs ~0).
#
# fcsel is the floating-point flavour: src1 when src0 is non-zero, otherwise
# src2.

triop("fcsel", tfloat32, "",
      "(src0 != 0.0f) ? src1 : src2")
838
# 3 way min/max/med
#
# The float variants are declared with the unsized tfloat type, so their
# constant expressions use the double-precision fmin()/fmax() rather than
# fminf()/fmaxf(): the float versions would narrow 64-bit operands to float
# and lose precision.  Going through double is exact for narrower operands
# because the result is always one of the inputs.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
851
# Boolean-condition select: src1 when the condition src0 is true, otherwise
# src2.  bcsel takes a 1-bit boolean condition, b32csel a 32-bit one.
opcode(
   "bcsel", 0, tuint,
   [0, 0, 0], [tbool1, tuint, tuint],
   False, "", "src0 ? src1 : src2")
opcode(
   "b32csel", 0, tuint,
   [0, 0, 0], [tbool32, tuint, tuint],
   False, "", "src0 ? src1 : src2")
856
# SM5 bfi assembly
#
# Bitfield insert driven by a mask: src0 is the mask, src1 the value to
# insert and src2 the base.  The insert value is shifted left by the number
# of trailing zero bits in the mask and then written into the masked bits of
# the base.  A zero mask returns the base unchanged.
triop(
   "bfi", tuint32, "",
   """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")
871
# SM5 ubfe/ibfe assembly
#
# ubfe: unsigned bitfield extract of src2 ("bits") bits starting at bit
# src1 ("offset") of src0.  Zero bits gives 0; a negative offset or bit
# count is undefined and folds to 0; when the field reaches or passes bit 32
# the result is simply base >> offset.
opcode(
   "ubfe", 0, tuint32,
   [0, 0, 0], [tuint32, tint32, tint32],
   False, "",
   """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
# ibfe: signed bitfield extract of src2 ("bits") bits starting at bit src1
# ("offset") of src0.  The field is moved to the top of the word and shifted
# back down on a signed int so that it is sign-extended.  Zero bits gives 0;
# a negative offset or bit count is undefined and folds to 0; when the field
# reaches or passes bit 32 the result is simply base >> offset.
opcode(
   "ibfe", 0, tint32,
   [0, 0, 0], [tint32, tint32, tint32],
   False, "",
   """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
901
# GLSL bitfieldExtract()
#
# Unsigned flavour: returns src2 ("bits") bits of src0 starting at bit src1
# ("offset"), zero-extended.  Zero bits gives 0; a negative argument or a
# field running past bit 32 is undefined per the spec and folds to 0.
opcode(
   "ubitfield_extract", 0, tuint32,
   [0, 0, 0], [tuint32, tint32, tint32],
   False, "",
   """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed GLSL bitfieldExtract(): returns src2 ("bits") bits of src0 starting
# at bit src1 ("offset"), sign-extended from the top bit of the field.
#
# The field is first shifted left so its most significant bit lands in
# bit 31, then shifted right arithmetically by (32 - bits) so that its least
# significant bit lands in bit 0 with the sign replicated above it.  The
# right shift amount must be (32 - bits), not offset: shifting by offset only
# gives the right answer when offset == 32 - bits.
#
# Zero bits gives 0; a negative argument or a field running past bit 32
# folds to 0.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
927
# vec3: assemble a 3-component vector from the first component (.x) of each
# of the three sources.
triop_horiz(
   "vec3", 3, 1, 1, 1,
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
935
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-source opcode with explicit output and source sizes.

   Forwards to opcode() with tuint as the output type and as the type of all
   four sources, is_conversion set to False and no algebraic properties.

   - name is the opcode name
   - output_size is the size passed for the destination
   - src1_size .. src4_size are the sizes passed for the sources
   - const_expr is passed through as the constant expression
   """
   source_sizes = [src1_size, src2_size, src3_size, src4_size]
   source_types = [tuint] * len(source_sizes)
   opcode(name, output_size, tuint, source_sizes, source_types, False, "",
          const_expr)
942
# bitfield_insert: replace src3 ("bits") bits of the base src0, starting at
# bit src2 ("offset"), with the low bits of src1 ("insert").  Zero bits
# returns the base unchanged; a negative argument or a field running past
# bit 32 folds to 0.
opcode(
   "bitfield_insert", 0, tuint32,
   [0, 0, 0, 0], [tuint32, tuint32, tint32, tint32],
   False, "",
   """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
956
# vec4: assemble a 4-component vector from the first component (.x) of each
# of the four sources.
quadop_horiz(
   "vec4", 4, 1, 1, 1, 1,
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
963
964