nir: add support for round to zero rounding mode to nir_op_f2f32
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - output_size and every entry of input_sizes must lie in 0..4; when
          output_size is non-zero no input size may be zero
        - all types are strings that get nir_type_ prepended to them
        - input_sizes and input_types are lists of equal length, one entry
          per source
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        # Check every entry, not just the first one, so that a malformed
        # later source is caught here instead of in the generated C code.
        assert all(isinstance(size, int) for size in input_sizes)
        assert isinstance(input_types, list)
        assert all(isinstance(type_, str) for type_ in input_types)
        assert isinstance(is_conversion, bool)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
92
# helper variables for strings
#
# Unsized base types.
tfloat, tint, tuint, tbool = "float", "int", "uint", "bool"
# Booleans with an explicit bit size.
tbool1, tbool32 = "bool1", "bool32"
# Sized integer and floating-point types.
tuint16 = "uint16"
tint32, tuint32, tfloat32 = "int32", "uint32", "float32"
tint64, tuint64, tfloat64 = "int64", "uint64", "float64"
107
# Splits a NIR type string into its base type and optional trailing bit size.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def _split_type(type_):
    """Match type_ against _TYPE_SPLIT_RE, asserting that it is a NIR type."""
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return parsed


def type_has_size(type_):
    """Return True when the type string carries an explicit bit size."""
    return _split_type(type_).group('bits') is not None


def type_size(type_):
    """Return the bit size of a sized type string as an int.

    Asserts when the string is not a NIR type or has no bit size.
    """
    bits = _split_type(type_).group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)


def type_sizes(type_):
    """Return the list of bit sizes the given type string can take.

    A sized type yields a one-element list; unsized 'bool' and 'float' have
    their own lists and every other unsized type gets the integer list.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    unsized = {
        'bool': [1, 32],
        'float': [16, 32, 64],
    }
    return unsized.get(type_, [1, 8, 16, 32, 64])


def type_base_type(type_):
    """Return the base type ('int', 'uint', 'float' or 'bool') of type_."""
    return _split_type(type_).group('type')
136
137 # Operation where the first two sources are commutative.
138 #
# For 2-source operations, this is just mathematical commutativity.  Some
140 # 3-source operations, like ffma, are only commutative in the first two
141 # sources.
_2src_commutative = "2src_commutative "
# Like the flag above, this string ends in a space so that several property
# strings can be concatenated with "+" into one space-separated list.
associative = "associative "

# global dictionary of opcodes, keyed by opcode name; opcode() fills it in
opcodes = {}
147
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Build an Opcode and record it in the global opcodes dictionary.

    All arguments are handed to the Opcode constructor unchanged.  Asserts
    that no opcode with the same name has been registered before.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes, input_types,
                  is_conversion, algebraic_properties, const_expr)
    opcodes[name] = info
154
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode (output and source size 0) whose source
    and destination types may differ.  It is not flagged as a conversion.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
157
def unop(name, ty, const_expr):
    """Register a one-source opcode (output and source size 0) whose source
    and destination share the type ty.
    """
    opcode(name, output_size=0, output_type=ty, input_sizes=[0],
           input_types=[ty], is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
160
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and source sizes."""
    opcode(name, output_size, output_type,
           input_sizes=[input_size], input_types=[input_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
165
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a reduction opcode.

    prereduce_expr is formatted with {src} for each source component and
    wrapped in parentheses, reduce_expr combines two such terms through
    {src0} and {src1}, and final_expr receives the parenthesized reduction
    as {src}.  The opcodes are named name + "2", name + "3" and name + "4".
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = ["(" + prereduce_expr.format(src=component) + ")"
                  for component in ("src0.x", "src0.y", "src0.z", "src0.w")]

    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               finish(combine(x, y)))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               finish(combine(combine(x, y), z)))
    # The four-component form reduces the two halves first, then joins them.
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               finish(combine(combine(x, y), combine(z, w))))
184
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode (output and source size 0) that is
    flagged as a type conversion.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=True,
           algebraic_properties="", const_expr=const_expr)
187
# Basic unary opcodes.  Several constant expressions test bit_size == 64 to
# choose between the double and the float flavour of a C library function or
# literal.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): unlike fsqrt above, these two always call the float variants,
# regardless of bit_size -- confirm that this is intended for 64-bit sources.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
205 unop("fexp2", tfloat, "exp2f(src0)")
206 unop("flog2", tfloat, "log2f(src0)")
207
# Generate all of the numeric conversion opcodes
#
# Opcodes are named "<s>2<d><bits>", built from the first letter of the
# source and destination base types and the destination bit size.
_conversion_dst_types = {
    tint: [tfloat, tint, tbool],
    tuint: [tfloat, tuint],
    tfloat: [tint, tuint, tfloat, tbool],
    tbool: [tfloat, tint],
}

for src_t in [tint, tuint, tfloat, tbool]:
    for dst_t in _conversion_dst_types[src_t]:
        float_to_float = (src_t == tfloat and dst_t == tfloat)
        for bit_size in type_sizes(dst_t):
            conv_name = "{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size)
            sized_dst_t = dst_t + str(bit_size)

            if float_to_float and bit_size == 16:
                # f2f16 additionally comes in explicit rounding-mode
                # variants; the unsuffixed opcode is registered last.
                for rnd_mode in ['_rtne', '_rtz', '']:
                    unop_numeric_convert(conv_name + rnd_mode, sized_dst_t,
                                         src_t, "src0")
                continue

            if float_to_float and bit_size == 32:
                # Narrowing from a wider float honours a round-to-zero
                # execution mode for 32-bit results.
                conv_expr = """
         if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
            dst = _mesa_double_to_float_rtz(src0);
         } else {
            dst = src0;
         }
         """
            elif dst_t == tbool:
                conv_expr = "src0 != 0"
            else:
                conv_expr = "src0"
            unop_numeric_convert(conv_name, sized_dst_t, src_t, conv_expr)
241
242
# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values whose magnitude is below 2^-14 become a zero with the sign of the
# source; everything else is round-tripped through the half-float helpers.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp stores the exponent through the pointer argument of frexp();
# frexp_sig keeps frexp()'s return value and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
273
274
275 # Floating point pack and unpack operations.
276
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components packed into one
    uint32, src0.x in the low 16 bits and src0.y in the high 16 bits.

    Every "fmt" in the template is replaced by the format name, so the
    constant expression calls pack_<fmt>_1x16().
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    opcode_name = "pack_" + fmt + "_2x16"
    unop_horiz(opcode_name, 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
282
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components packed into one
    uint32, one byte each, with src0.x in the lowest byte.

    Every "fmt" in the template is replaced by the format name, so the
    constant expression calls pack_<fmt>_1x8().
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    opcode_name = "pack_" + fmt + "_4x8"
    unop_horiz(opcode_name, 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
290
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 unpacked into two float32
    components, dst.x from the low 16 bits and dst.y from the high 16 bits.

    Every "fmt" in the template is replaced by the format name, so the
    constant expression calls unpack_<fmt>_1x16().
    """
    # The high half must be shifted *down* before the uint16_t cast; a left
    # shift by 16 leaves the low 16 bits zero, so the cast would always
    # yield 0.  This mirrors the layout written by pack_2x16().
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
296
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 unpacked into four float32
    components, one byte each, with dst.x taken from the lowest byte.

    Every "fmt" in the template is replaced by the format name, so the
    constant expression calls unpack_<fmt>_1x8().
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    opcode_name = "unpack_" + fmt + "_4x8"
    unop_horiz(opcode_name, 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
304
305
# Instantiate the pack/unpack opcodes for each supported format.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Integer packing: src0.x supplies the lowest bits of the result.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
        (src0.y << 8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# The casts widen each component before shifting so no bits are lost.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
339
# Split a 64-bit value into four 16-bit components, lowest 16 bits first.
# The source has a single component, so every output -- including dst.w --
# must be taken from src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
342
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

# The *_split_x opcodes yield the low half of the source and the *_split_y
# opcodes the high half.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
359
360 # Bit operations, part of ARB_gpu_shader5.
361
362
# Reverse the order of the 32 bits of the source.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Count the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the highest set bit, or -1 when the source is zero.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant; yields -1 when no matching bit exists (src0 is 0 or -1).
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the lowest set bit, or -1 when the source is zero.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
410
411
# Register fnoise<out>_<in> for every combination of 1 to 4 output and input
# components.  The constant expression of each one is simply 0.0f.
for out_components in range(1, 5):
    for in_components in range(1, 5):
        noise_name = "fnoise{0}_{1}".format(out_components, in_components)
        unop_horiz(noise_name, out_components, tfloat, in_components, tfloat,
                   "0.0f")
415
416
# AMD_gcn_shader extended instructions
#
# Both opcodes pick the axis of src0 with the largest magnitude.  The tests
# use >= on every axis, so on a tie a later matching "if" overrides an
# earlier one.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# Face index 0..5 for the +X, -X, +Y, -Y, +Z, -Z axis respectively.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
454
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode (output and source sizes 0) whose two
    sources share in_type while the result has out_type.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0, 0],
           input_types=[in_type, in_type], is_conversion=False,
           algebraic_properties=alg_props, const_expr=const_expr)
458
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type ty."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
461
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty with a bool1 result."""
    binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
464
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty with a bool32 result."""
    binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
467
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types for the
    result and for each source; no algebraic properties are set.
    """
    source_sizes = [src1_size, src2_size]
    source_types = [src1_type, src2_type]
    opcode(name, out_size, out_type, source_sizes, source_types,
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
472
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a two-source
    reduction opcode.

    prereduce_expr is formatted with matching components of both sources as
    {src0} and {src1} and wrapped in parentheses, reduce_expr combines two
    such terms, and final_expr receives the parenthesized reduction as
    {src}.  Every variant is registered as 2src_commutative.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    def term(component):
        paired = prereduce_expr.format(src0="src0." + component,
                                       src1="src1." + component)
        return "(" + paired + ")"

    x, y, z, w = term("x"), term("y"), term("z"), term("w")

    opcode(name + "2", output_size, output_type,
           [2, 2], [src_type, src_type], False, _2src_commutative,
           finish(combine(x, y)))
    opcode(name + "3", output_size, output_type,
           [3, 3], [src_type, src_type], False, _2src_commutative,
           finish(combine(combine(x, y), z)))
    # The four-component form reduces the two halves first, then joins them.
    opcode(name + "4", output_size, output_type,
           [4, 4], [src_type, src_type], False, _2src_commutative,
           finish(combine(combine(x, y), combine(z, w))))
494
binop("fadd", tfloat, _2src_commutative + associative, "src0 + src1")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add: a wrapped sum is replaced by the largest
# (1 << (bit_size - 1)) - 1 or smallest (1 << (bit_size - 1)) value.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
""")
# Saturating unsigned add: a wrapped sum is replaced by the maximum value.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract, using the same limits as iadd_sat.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
""")
# Saturating unsigned subtract: clamps at zero.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, _2src_commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")
523
# high half of signed integer multiply: the upper bit_size bits of the full
# product.  64-bit sources go through a multi-word multiply; narrower ones
# use a 64-bit product shifted right by bit_size.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly. The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high half of unsigned integer multiply: the upper bit_size bits of the
# full product.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# product of the low halves (bit_size / 2 bits each) of the two unsigned
# sources; both are masked before the multiply.
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")
570
571
binop("fdiv", tfloat, "", "src0 / src1")
# Integer division by zero folds to 0 instead of trapping.
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
585
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +  (x & y)     + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
598
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) -  ((x ^ y)       >> 1)
#
# The shifted term must be subtracted.  Example: rhadd(3, 0) is
# (3 + 0 + 1) >> 1 = 2, and (3 | 0) - ((3 ^ 0) >> 1) = 3 - 1 = 2.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
611
# Integer modulus and remainder by zero fold to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to a non-zero C remainder when the operands differ in sign.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# NOTE(review): both expressions use the float variants floorf/truncf for
# every bit size -- confirm that this is acceptable for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
628
629 #
630 # Comparisons
631 #
632
633
# these integer-aware comparisons return a boolean (0 or ~0)
#
# Each comparison is registered twice: once with a bool1 result and once,
# with a "32" suffix, with a bool32 result.

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare("fne", tfloat, _2src_commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare("ine", tint, _2src_commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
binop_compare32("flt32", tfloat, "", "src0 < src1")
binop_compare32("fge32", tfloat, "", "src0 >= src1")
binop_compare32("feq32", tfloat, _2src_commutative, "src0 == src1")
binop_compare32("fne32", tfloat, _2src_commutative, "src0 != src1")
binop_compare32("ilt32", tint, "", "src0 < src1")
binop_compare32("ige32", tint, "", "src0 >= src1")
binop_compare32("ieq32", tint, _2src_commutative, "src0 == src1")
binop_compare32("ine32", tint, _2src_commutative, "src0 != src1")
binop_compare32("ult32", tuint, "", "src0 < src1")
binop_compare32("uge32", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# The "all" forms AND the per-component results together and the "any"
# forms OR them.

binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
# NOTE(review): sge is declared with the unsized tfloat while its three
# siblings use tfloat32 -- confirm whether that difference is intentional.
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
691
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a 32-bit unsigned source and is masked with the
# bit width of src0 minus one.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Rotates: the bits shifted out on one side are ORed back in on the other.
# Masking -src1 yields the complementary shift count.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")
712
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")


# Dot products: component-wise products summed together.  The _replicated
# variants declare a four-component output instead of one.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph takes a three-component src0 and a four-component src1; src1.w is
# added to the three-component dot product.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# NOTE(review): fmin/fmax call the float variants fminf/fmaxf for every bit
# size and, unlike the integer forms, carry no algebraic properties.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
741
# The 4x8 opcodes treat each 32-bit source as four 8-bit lanes; every loop
# below visits the lanes at bit offsets 0, 8, 16 and 24.

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A lane stays zero unless src0's lane is larger than src1's.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
786
# Power function.  As with the other bit_size-dependent expressions in this
# file (fsqrt, fsin, ...), 64-bit sources use the double-precision function
# and narrower sources the single-precision one.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
788
789 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
790 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
791
792 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
793 "src0 | ((uint64_t)src1 << 32)")
794
795 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
796 "src0 | ((uint32_t)src1 << 16)")
797
798 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
799 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
800 # are from the low five bits of src0 and src1, respectively.
801 binop_convert("bfm", tuint32, tint32, "", """
802 int bits = src0 & 0x1F;
803 int offset = src1 & 0x1F;
804 dst = ((1u << bits) - 1) << offset;
805 """)
806
# ldexp: scales the float src0 by two to the power of the 32-bit integer src1,
# using ldexp() for 64-bit operands and ldexpf() otherwise.  Any result that
# is not a normal number is replaced by a zero carrying the sign of src0.
# NOTE(review): isnormal() is false not only for denormals but also for
# infinities and NaN, so those results are replaced by signed zero as well --
# confirm this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
""")
813
# Combines the first component of each input to make a 2-component vector.
# The arguments after the name are presumably (output size, output type,
# src0 size, src0 type, src1 size, src1 type, expression) -- confirm against
# the binop_horiz definition.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")
820
# Byte extraction
# src1 selects the byte, counted from the least-significant end of src0.  The
# cast keeps the low 8 bits of the shifted value; the (int8_t) variant yields
# a signed value, so the selected byte is sign-extended in the result.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# Same scheme with 16-bit words: src1 selects the word and the cast to
# uint16_t / int16_t zero- or sign-extends it.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
828
829
def triop(name, ty, alg_props, const_expr):
    """Declare a three-source opcode whose result and sources share one type.

    Forwards to opcode() with an output size of 0, a size of 0 for each of
    the three sources, `ty` as the type of the result and of every source,
    False as the sixth argument (presumably is_conversion, mirroring
    Opcode.__init__ -- confirm), and the given algebraic properties and
    constant expression.
    """
    src_sizes = [0] * 3
    src_types = [ty] * 3
    opcode(name, 0, ty, src_sizes, src_types, False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Declare a three-source opcode with explicit output and source sizes.

    The result and all three sources are typed tuint.  No algebraic
    properties are passed (empty string) and the sixth argument to opcode()
    is False (presumably is_conversion -- confirm).
    """
    src_sizes = [src1_size, src2_size, src3_size]
    src_types = [tuint] * 3
    opcode(name, output_size, tuint, src_sizes, src_types, False, "",
           const_expr)
836
# Multiply-add: src0 * src1 + src2.  _2src_commutative presumably marks the
# opcode as commutative in its first two sources -- confirm.
triop("ffma", tfloat, _2src_commutative, "src0 * src1 + src2")

# Linear interpolation: yields src0 when src2 is 0 and src1 when src2 is 1.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
#
# fcsel is the floating point version: it yields src1 when src0 compares
# unequal to 0.0f and src2 otherwise.


triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
849
# 3 way min/max/med
#
# The float variants are declared with tfloat, which is also used for 64-bit
# operands (other tfloat expressions in this file test bit_size == 64).  They
# therefore use the double-precision fmin()/fmax(): the single-precision
# fminf()/fmaxf() would round 64-bit sources to float before comparing.  For
# narrower floats the promotion to double is exact and the result is always
# one of the sources, so converting it back loses nothing.
#
# med3 is the median of the three sources: the larger of src0/src1 is clamped
# from above by src2, and the result is clamped from below by the smaller of
# src0/src1.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
862
# Boolean conditional select: yields src1 when the boolean src0 is true and
# src2 otherwise.  The two opcodes differ only in the type of the condition
# source (tbool1 vs tbool32); the selected values are tuint in both.
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
867
# SM5 bfi assembly
#
# Sources are (mask, insert, base).  insert is shifted left once for every
# trailing zero bit of mask, so its low bits line up with the lowest set bit
# of the mask; the bits of base covered by mask are then replaced with the
# corresponding bits of the shifted insert.  A zero mask returns base
# unchanged, which also keeps the alignment loop from running without a set
# bit to stop on.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
unsigned tmp = mask;
while (!(tmp & 1)) {
tmp >>= 1;
insert <<= 1;
}
dst = (base & ~mask) | (insert & mask);
}
""")


# Bitwise select: each result bit is taken from src1 where src0 has a 1 bit
# and from src2 where src0 has a 0 bit.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
885
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
#
# Sources are (base, offset, bits).  bits == 0 yields 0.  When the field ends
# below bit 32, base is shifted left so the field's top bit sits at bit 31 and
# then shifted right by (32 - bits) to bring the field down to bit 0; with the
# unsigned base of ubfe that right shift fills with zeros.  Otherwise the
# result is simply base >> offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
dst = 0;
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
# ibfe uses the same expression with a signed base, so the right shifts are
# performed on an int (sign-extending where the compiler implements signed
# right shift arithmetically).
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
dst = 0;
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
913
# GLSL bitfieldExtract()
#
# Sources are (base, offset, bits).  The result is 0 when bits == 0, and also
# when offset or bits is negative or the field would extend past bit 32.
# Otherwise base is shifted right by offset and masked to `bits` bits; the
# mask is built from 1ull so that bits == 32 does not shift a 32-bit one out
# of range.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed bitfield extract: returns the `bits` bits of src0 that start at bit
# `offset`, sign-extended from the top bit of the field.  The result is 0 when
# bits == 0, and also when offset or bits is negative or the field would
# extend past bit 32.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   /* The left shift moves the field's top bit to bit 31; the right shift
    * must then be by (32 - bits) so the field's lowest bit lands on bit 0.
    * Shifting right by `offset` is only correct when offset + bits == 32.
    */
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
939
# Combines the first component of each input to make a 3-component vector.
# The numeric arguments are the output size (3) followed by the three source
# sizes (1 each), matching the triop_horiz parameter order.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
947
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Declare a four-source opcode with explicit output and source sizes.

    The result and all four sources are typed tuint.  No algebraic
    properties are passed (empty string) and the sixth argument to opcode()
    is False (presumably is_conversion -- confirm).
    """
    src_sizes = [src1_size, src2_size, src3_size, src4_size]
    src_types = [tuint] * 4
    opcode(name, output_size, tuint, src_sizes, src_types, False, "",
           const_expr)
954
# Bitfield insert with sources (base, insert, offset, bits): replaces the
# `bits` bits of base that start at bit `offset` with the low bits of insert.
# bits == 0 returns base unchanged; a negative offset or bits, or a field
# extending past bit 32, returns 0.  The mask is built from 1ull so that
# bits == 32 does not shift a 32-bit one out of range.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
dst = 0;
} else {
unsigned mask = ((1ull << bits) - 1) << offset;
dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
968
# Combines the first component of each input to make a 4-component vector.
# The numeric arguments are the output size (4) followed by the four source
# sizes (1 each), matching the quadop_horiz parameter order.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
975
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
#
# The expression multiplies the high 16 bits of src0 by the low 16 bits of
# src1, shifts the product left by 16 and adds src2.  All sizes are 1, so
# only the .x component of each source is read and only dst.x is written.
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")