nir/builder: Add nir_b2i
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Everything known about a single opcode.

    NOTE: the fields stored here must be kept in sync with nir_op_info.
    """

    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Validate the description of an opcode and store it.

        Parameters:

        - name is the opcode name (prepend nir_op_ for the enum name)
        - output_size and each entry of input_sizes is an int from 0 to 4;
          when output_size is non-zero no input size may be zero
        - output_type and the entries of input_types are strings that get
          nir_type_ prepended to them
        - input_sizes and input_types are non-empty lists of equal length
        - is_conversion is True if the opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_
          is prepended before each entry
        - const_expr is a C expression or series of statements that computes
          the constant value of the opcode from the constant values of its
          inputs

        Constant expressions are written in terms of src0, src1, ...,
        src(N-1), where N is the number of arguments, and store their result
        in dst.  Per-component inputs and outputs are scalars; the others are
        structs with fields x, y, z and w of the correct type.  No conversion
        is needed on inputs or outputs; in particular the conversion between
        the C bool type and NIR_TRUE/NIR_FALSE happens automatically.

        A per-component expression is executed once per component.  A
        non-per-component expression is expected to fill in dst.x, dst.y,
        etc.  If "dst" does not appear anywhere in the expression, the result
        is assigned automatically: "dst = <expression>" for per-component
        instructions and "dst.x = dst.y = ... = <expression>" otherwise.

        Every check is an assert, so a malformed description raises
        AssertionError (or IndexError for an empty input list).
        """
        # Type checks, in the same order as the parameters.
        for value, expected in ((name, str), (output_size, int),
                                (output_type, str)):
            assert isinstance(value, expected)
        for seq, elem_type in ((input_sizes, int), (input_types, str)):
            assert isinstance(seq, list)
            # Only the first element is type-checked.
            assert isinstance(seq[0], elem_type)
        for value, expected in ((is_conversion, bool),
                                (algebraic_properties, str),
                                (const_expr, str)):
            assert isinstance(value, expected)

        # Range and consistency checks.
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            assert output_size == 0 or size != 0

        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
92
# Shorthand names for the NIR type strings used by the opcode declarations.

# Types without an explicit bit size.
tfloat = 'float'
tint = 'int'
tuint = 'uint'
tbool = 'bool'

# Types with an explicit bit size.
tbool1 = 'bool1'
tbool32 = 'bool32'
tuint16 = 'uint16'
tint32 = 'int32'
tuint32 = 'uint32'
tfloat32 = 'float32'
tint64 = 'int64'
tuint64 = 'uint64'
tfloat64 = 'float64'
107
# Splits a NIR type string into its base type and optional bit size.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def _match_type(type_):
    """Return the regex match for a NIR type string, asserting it is valid."""
    found = _TYPE_SPLIT_RE.match(type_)
    assert found is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return found


def type_has_size(type_):
    """Return True if the type string carries an explicit bit size."""
    return _match_type(type_).group('bits') is not None


def type_size(type_):
    """Return the bit size of a sized type string as an int.

    Asserts if the string is not a valid type or has no bit size.
    """
    bits = _match_type(type_).group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)


def type_sizes(type_):
    """Return the list of bit sizes the type string may take.

    A sized type yields a one-element list with its own size.  Unsized
    'bool' yields [1, 32], unsized 'float' yields [16, 32, 64], and every
    other unsized type yields [1, 8, 16, 32, 64].
    """
    if type_has_size(type_):
        return [type_size(type_)]
    unsized = {'bool': [1, 32], 'float': [16, 32, 64]}
    # Copy so that each caller gets its own list.
    return list(unsized.get(type_, [1, 8, 16, 32, 64]))


def type_base_type(type_):
    """Return the base type (int, uint, float or bool) of a type string."""
    return _match_type(type_).group('type')
136
# Algebraic property strings.  Each one ends in a space so that several
# properties can be concatenated with "+".

# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
_2src_commutative = '2src_commutative '

associative = 'associative '
144
# Registry of every opcode declared in this file, keyed by opcode name.
opcodes = {}


def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Build an Opcode from the arguments and add it to ``opcodes``.

    Asserts that no opcode called ``name`` has been registered already.  The
    remaining arguments are passed through to the Opcode constructor.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes, input_types,
                  is_conversion, algebraic_properties, const_expr)
    opcodes[name] = info
154
def unop_convert(name, out_type, in_type, const_expr):
    """Declare a one-source opcode whose output and input types may differ.

    Sizes are 0 for both the output and the input, the opcode is not flagged
    as a conversion, and it has no algebraic properties.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
157
def unop(name, ty, const_expr):
    """Declare a one-source opcode whose source and output share type ``ty``.

    Sizes are 0 for both the output and the input, the opcode is not flagged
    as a conversion, and it has no algebraic properties.
    """
    opcode(name, output_size=0, output_type=ty, input_sizes=[0],
           input_types=[ty], is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
160
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Declare a one-source opcode with explicit output and input sizes.

    The opcode is not flagged as a conversion and has no algebraic
    properties.
    """
    opcode(name, output_size, output_type,
           input_sizes=[input_size], input_types=[input_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
165
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Declare the 2-, 3- and 4-component variants of a reduction opcode.

    - prereduce_expr is a format string with a {src} field, applied to each
      source component (src0.x, src0.y, ...) and wrapped in parentheses
    - reduce_expr is a format string with {src0} and {src1} fields that
      combines two partial results
    - final_expr is a format string with a {src} field, applied to the
      parenthesised result of the reduction

    The opcodes are named name + "2", name + "3" and name + "4".  The
    4-component form reduces (x, y) and (z, w) separately before combining
    the two halves.
    """
    # One parenthesised term per component of the source vector.
    x, y, z, w = [
        "(" + prereduce_expr.format(src="src0." + c) + ")" for c in "xyzw"
    ]

    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    for width, reduced in ((2, combine(x, y)),
                           (3, combine(combine(x, y), z)),
                           (4, combine(combine(x, y), combine(z, w)))):
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, finish(reduced))
184
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Declare a one-source opcode that is flagged as a type conversion.

    Identical to unop_convert except that is_conversion is True.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=True,
           algebraic_properties="", const_expr=const_expr)
187
# Basic unary opcodes.  Each unop() call gives the opcode name, the type
# shared by the source and the destination, and the C constant expression.
188 unop("mov", tuint, "src0")
189
190 unop("ineg", tint, "-src0")
191 unop("fneg", tfloat, "-src0")
192 unop("inot", tint, "~src0") # invert every bit of the integer
# fsign yields 0, 1 or -1; the bit_size test only selects between double and
# float literals.
193 unop("fsign", tfloat, ("bit_size == 64 ? " +
194 "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
195 "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
196 unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
197 unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
198 unop("fabs", tfloat, "fabs(src0)")
# fsat clamps to [0, 1].
199 unop("fsat", tfloat, ("bit_size == 64 ? " +
200 "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
201 "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
202 unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
203 unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
204 unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): fexp2 and flog2 always call the single-precision exp2f/log2f,
# unlike the neighbouring opcodes that switch on bit_size -- confirm intended.
205 unop("fexp2", tfloat, "exp2f(src0)")
206 unop("flog2", tfloat, "log2f(src0)")
207
# Generate all of the numeric conversion opcodes.
#
# Opcodes are named <src initial>2<dst initial><bit size>, e.g. "f2i32".
# float -> 16-bit float additionally gets "_rtne" and "_rtz" variants next to
# the plain name.  Conversions to bool compare the source against zero; all
# others pass the source through unchanged.
for src_t in [tint, tuint, tfloat, tbool]:
    dst_types = {
        tbool: [tfloat, tint],
        tint: [tfloat, tint, tbool],
        tuint: [tfloat, tuint],
        tfloat: [tint, tuint, tfloat, tbool],
    }[src_t]

    for dst_t in dst_types:
        for bit_size in type_sizes(dst_t):
            sized_dst = dst_t + str(bit_size)
            base_name = "{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size)
            if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
                for rnd_mode in ['_rtne', '_rtz', '']:
                    unop_numeric_convert(base_name + rnd_mode, sized_dst,
                                         src_t, "src0")
            else:
                conv_expr = "src0 != 0" if dst_t == tbool else "src0"
                unop_numeric_convert(base_name, sized_dst, src_t, conv_expr)
231
232
233 # Unary floating-point rounding operations.
234
235
# Each expression picks the double or float libm routine based on bit_size.
236 unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
237 unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
238 unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
239 unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
240 unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
241
# fquantize2f16 returns a zero carrying the sign of src0 when |src0| is below
# 2^-14; otherwise it round-trips the value through a half-float conversion.
242 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
243
244 # Trigonometric operations.
245
246
247 unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
248 unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
249
250 # dfrexp
# frexp_exp lets frexp() write the exponent straight into dst and ignores the
# return value; frexp_sig keeps the return value and throws the exponent away
# in the scratch variable n.
251 unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
252 unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
253
254 # Partial derivatives.
255
256
# Every derivative variant folds to the constant 0.0.
257 unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
258 unop("fddy", tfloat, "0.0")
259 unop("fddx_fine", tfloat, "0.0")
260 unop("fddy_fine", tfloat, "0.0")
261 unop("fddx_coarse", tfloat, "0.0")
262 unop("fddy_coarse", tfloat, "0.0")
263
264
265 # Floating point pack and unpack operations.
266
def pack_2x16(fmt):
    """Declare the ``pack_<fmt>_2x16`` opcode.

    The opcode takes a 2-component 32-bit float source and produces one
    32-bit unsigned value: component x fills the low 16 bits and component y
    is shifted into the high 16 bits.  Every occurrence of "fmt" in the C
    template is replaced by ``fmt``, selecting the pack_<fmt>_1x16 helper.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    unop_horiz("pack_{0}_2x16".format(fmt), 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
272
def pack_4x8(fmt):
    """Declare the ``pack_<fmt>_4x8`` opcode.

    The opcode takes a 4-component 32-bit float source and produces one
    32-bit unsigned value, with components x, y, z and w shifted left by 0,
    8, 16 and 24 bits respectively.  Every occurrence of "fmt" in the C
    template is replaced by ``fmt``, selecting the pack_<fmt>_1x8 helper.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    unop_horiz("pack_{0}_4x8".format(fmt), 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
280
def unpack_2x16(fmt):
    """Declare the ``unpack_<fmt>_2x16`` opcode.

    The opcode takes one 32-bit unsigned source and produces two 32-bit
    floats: component x is decoded from the low 16 bits of the source and
    component y from the high 16 bits.  Every occurrence of "fmt" in the C
    template is replaced by ``fmt``, selecting the unpack_<fmt>_1x16 helper.
    """
    # The high half has to be shifted *down* before the uint16_t cast.
    # Shifting left by 16 leaves zeros in the low 16 bits, so dst.y would
    # always be decoded from 0.
    template = """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
"""
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
286
def unpack_4x8(fmt):
    """Declare the ``unpack_<fmt>_4x8`` opcode.

    The opcode takes one 32-bit unsigned source and produces four 32-bit
    floats, decoded from bits 0-7, 8-15, 16-23 and 24-31 of the source for
    components x, y, z and w respectively.  Every occurrence of "fmt" in the
    C template is replaced by ``fmt``, selecting the unpack_<fmt>_1x8 helper.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    unop_horiz("unpack_{0}_4x8".format(fmt), 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
294
295
# Declare the pack/unpack opcodes for each supported format.  The "half"
# format only has 2x16 variants.
for declare, fmt_name in (
        (pack_2x16, "snorm"),
        (pack_4x8, "snorm"),
        (pack_2x16, "unorm"),
        (pack_4x8, "unorm"),
        (pack_2x16, "half"),
        (unpack_2x16, "snorm"),
        (unpack_4x8, "snorm"),
        (unpack_2x16, "unorm"),
        (unpack_4x8, "unorm"),
        (unpack_2x16, "half")):
    declare(fmt_name)
306
# Integer packing opcodes: several narrow components are combined into one
# wider value, with component x in the lowest bits.
# pack_uvec2_to_uint masks x to 16 bits before OR-ing in y << 16.
307 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
308 dst.x = (src0.x & 0xffff) | (src0.y << 16);
309 """)
310
# NOTE(review): unlike pack_uvec2_to_uint, the components here are not masked
# before being shifted -- presumably callers guarantee 8-bit values; confirm.
311 unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
312 dst.x = (src0.x << 0) |
313 (src0.y << 8) |
314 (src0.z << 16) |
315 (src0.w << 24);
316 """)
317
# The casts widen each component before it is shifted into place.
318 unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
319 "dst.x = src0.x | ((uint32_t)src0.y << 16);")
320
321 unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
322 "dst.x = src0.x | ((uint64_t)src0.y << 32);")
323
324 unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
325 "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
326
# Splits a 64-bit value into its low (x) and high (y) 32-bit halves.
327 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
328 "dst.x = src0.x; dst.y = src0.x >> 32;")
329
# Split a 64-bit value into four 16-bit words, lowest word first.  The source
# has a single component, so every word (including dst.w) is read from
# src0.x; there is no src0.w to read.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
332
# Splits a 32-bit value into its low (x) and high (y) 16-bit halves.
333 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
334 "dst.x = src0.x; dst.y = src0.x >> 16;")
335
336 # Lowered floating point unpacking operations.
337
338
# The _split_x opcodes yield the low half of the source and the _split_y
# opcodes the high half, each as a separate one-source opcode.
339 unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
340 "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
341 unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
342 "unpack_half_1x16((uint16_t)(src0 >> 16))")
343
344 unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
345 unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")
346
347 unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
348 unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
349
350 # Bit operations, part of ARB_gpu_shader5.
351
352
# Reverses the order of the 32 bits of the source.
353 unop("bitfield_reverse", tuint32, """
354 /* we're not winning any awards for speed here, but that's ok */
355 dst = 0;
356 for (unsigned bit = 0; bit < 32; bit++)
357 dst |= ((src0 >> bit) & 1) << (31 - bit);
358 """)
# Counts the set bits among the low bit_size bits of the source.
359 unop_convert("bit_count", tuint32, tuint, """
360 dst = 0;
361 for (unsigned bit = 0; bit < bit_size; bit++) {
362 if ((src0 >> bit) & 1)
363 dst++;
364 }
365 """)
366
# Index of the highest set bit, or -1 when no bit is set.
367 unop_convert("ufind_msb", tint32, tuint, """
368 dst = -1;
369 for (int bit = bit_size - 1; bit >= 0; bit--) {
370 if ((src0 >> bit) & 1) {
371 dst = bit;
372 break;
373 }
374 }
375 """)
376
# Signed variant, fixed to 32 bits: -1 when no qualifying bit exists.
377 unop("ifind_msb", tint32, """
378 dst = -1;
379 for (int bit = 31; bit >= 0; bit--) {
380 /* If src0 < 0, we're looking for the first 0 bit.
381 * if src0 >= 0, we're looking for the first 1 bit.
382 */
383 if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
384 (!((src0 >> bit) & 1) && (src0 < 0))) {
385 dst = bit;
386 break;
387 }
388 }
389 """)
390
# Index of the lowest set bit, or -1 when no bit is set.
391 unop_convert("find_lsb", tint32, tint, """
392 dst = -1;
393 for (unsigned bit = 0; bit < bit_size; bit++) {
394 if ((src0 >> bit) & 1) {
395 dst = bit;
396 break;
397 }
398 }
399 """)
400
401
# Declare fnoise<i>_<j> for every output size i and input size j from 1 to 4.
# The constant expression of every variant is simply 0.0f.
for i in range(1, 5):
    for j in range(1, 5):
        noise_name = "fnoise%d_%d" % (i, j)
        unop_horiz(noise_name, i, tfloat, j, tfloat, "0.0f")
405
406
407 # AMD_gcn_shader extended instructions
# cube_face_coord: ma is twice the component with the largest magnitude; the
# selected pair of coordinates is divided by ma and offset by 0.5.
# NOTE(review): the tests are independent ifs, so when magnitudes tie the last
# matching test wins; ma stays 0.0 only if no test matches (e.g. NaN inputs).
408 unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
409 dst.x = dst.y = 0.0;
410 float absX = fabs(src0.x);
411 float absY = fabs(src0.y);
412 float absZ = fabs(src0.z);
413
414 float ma = 0.0;
415 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
416 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
417 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
418
419 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
420 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
421 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
422 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
423 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
424 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
425
426 dst.x = dst.x / ma + 0.5;
427 dst.y = dst.y / ma + 0.5;
428 """)
429
# cube_face_index: 0/1 for +x/-x, 2/3 for +y/-y, 4/5 for +z/-z, selected by
# the component with the largest magnitude and its sign.
430 unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
431 float absX = fabs(src0.x);
432 float absY = fabs(src0.y);
433 float absZ = fabs(src0.z);
434 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
435 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
436 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
437 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
438 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
439 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
440 """)
441
442 # Sum of vector components
# Declares fsum2, fsum3 and fsum4 with identity pre- and final expressions.
443 unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
444
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Declare a two-source opcode whose output and input types may differ.

    Both sources share ``in_type``.  Sizes are 0 for the output and both
    inputs, and the opcode is not flagged as a conversion.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0, 0],
           input_types=[in_type, in_type], is_conversion=False,
           algebraic_properties=alg_props, const_expr=const_expr)
448
def binop(name, ty, alg_props, const_expr):
    """Declare a two-source opcode whose sources and output share type ``ty``."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
451
def binop_compare(name, ty, alg_props, const_expr):
    """Declare a two-source comparison on type ``ty`` with a bool1 result."""
    binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
454
def binop_compare32(name, ty, alg_props, const_expr):
    """Declare a two-source comparison on type ``ty`` with a bool32 result."""
    binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
457
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Declare a two-source opcode with explicit sizes and types per source.

    The opcode is not flagged as a conversion and has no algebraic
    properties.
    """
    opcode(name, out_size, out_type,
           input_sizes=[src1_size, src2_size],
           input_types=[src1_type, src2_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
462
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Declare the 2-, 3- and 4-component variants of a two-source reduction.

    - prereduce_expr is a format string with {src0} and {src1} fields,
      applied to matching components of the two sources (src0.x with
      src1.x, ...) and wrapped in parentheses
    - reduce_expr is a format string with {src0} and {src1} fields that
      combines two partial results
    - final_expr is a format string with a {src} field, applied to the
      parenthesised result of the reduction

    The opcodes are named name + "2", name + "3" and name + "4"; all of them
    are marked commutative in their two sources.  The 4-component form
    reduces (x, y) and (z, w) separately before combining the two halves.
    """
    # One parenthesised term per component of the source vectors.
    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + c, src1="src1." + c) + ")"
        for c in "xyzw"
    ]

    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    for width, reduced in ((2, combine(x, y)),
                           (3, combine(combine(x, y), z)),
                           (4, combine(combine(x, y), combine(z, w)))):
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], False,
               _2src_commutative, finish(reduced))
484
# Addition, subtraction and multiplication opcodes, including the saturating
# forms.
485 binop("fadd", tfloat, _2src_commutative + associative, "src0 + src1")
486 binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Signed saturation: (1ull << (bit_size - 1)) - 1 is the largest value of the
# type and (1ull << (bit_size - 1)) is the bit pattern of the smallest.
487 binop("iadd_sat", tint, _2src_commutative, """
488 src1 > 0 ?
489 (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
490 (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
491 """)
# Unsigned saturation: a wrapped sum is smaller than src0.  The width is
# taken from sizeof(src0) rather than bit_size here.
492 binop("uadd_sat", tuint, _2src_commutative,
493 "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
494 binop("isub_sat", tint, "", """
495 src1 < 0 ?
496 (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
497 (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
498 """)
499 binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")
500
501 binop("fsub", tfloat, "", "src0 - src1")
502 binop("isub", tint, "", "src0 - src1")
503
504 binop("fmul", tfloat, _2src_commutative + associative, "src0 * src1")
505 # low 32-bits of signed/unsigned integer multiply
506 binop("imul", tint, _2src_commutative + associative, "src0 * src1")
507
508 # Generate 64 bit result from 2 32 bits quantity
# Both sources are widened to 64 bits before multiplying.
509 binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
510 "(int64_t)src0 * (int64_t)src1")
511 binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
512 "(uint64_t)src0 * (uint64_t)src1")
513
514 # high 32-bits of signed integer multiply
# NOTE(review): despite the "32-bits" wording, the else branch shifts the
# widened product right by bit_size, and the 64-bit branch takes words 2 and
# 3 of the multi-word product, i.e. the high half for the operand width.
515 binop("imul_high", tint, _2src_commutative, """
516 if (bit_size == 64) {
517 /* We need to do a full 128-bit x 128-bit multiply in order for the sign
518 * extension to work properly. The casts are kind-of annoying but needed
519 * to prevent compiler warnings.
520 */
521 uint32_t src0_u32[4] = {
522 src0,
523 (int64_t)src0 >> 32,
524 (int64_t)src0 >> 63,
525 (int64_t)src0 >> 63,
526 };
527 uint32_t src1_u32[4] = {
528 src1,
529 (int64_t)src1 >> 32,
530 (int64_t)src1 >> 63,
531 (int64_t)src1 >> 63,
532 };
533 uint32_t prod_u32[4];
534 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
535 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
536 } else {
537 dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
538 }
539 """)
540
541 # high 32-bits of unsigned integer multiply
# Same structure as imul_high, but the 64-bit sources are split into two
# words each because no sign extension is needed.
542 binop("umul_high", tuint, _2src_commutative, """
543 if (bit_size == 64) {
544 /* The casts are kind-of annoying but needed to prevent compiler warnings. */
545 uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
546 uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
547 uint32_t prod_u32[4];
548 ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
549 dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
550 } else {
551 dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
552 }
553 """)
554
555 # low 32-bits of unsigned integer multiply
# NOTE(review): the expression multiplies the low bit_size / 2 bits of each
# source (mask = (1 << (bit_size / 2)) - 1), which does not match the
# "low 32-bits" wording above -- confirm the intended description.
556 binop("umul_low", tuint32, _2src_commutative, """
557 uint64_t mask = (1 << (bit_size / 2)) - 1;
558 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
559 """)
560
561
# Division.  The integer forms fold a division by zero to 0 instead of
# evaluating it.
562 binop("fdiv", tfloat, "", "src0 / src1")
563 binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
564 binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
565
566 # returns a boolean representing the carry resulting from the addition of
567 # the two unsigned arguments.
568
# The result is declared with the unsigned type, not a bool type; the C
# comparison yields 0 or 1.
569 binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")
570
571 # returns a boolean representing the borrow resulting from the subtraction
572 # of the two unsigned arguments.
573
574 binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
575
576 # hadd: (a + b) >> 1 (without overflow)
577 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
578 # = (x & y) + (x & ~y) + (x & y) + (~x & y)
579 # = 2 * (x & y) + (x & ~y) + (~x & y)
580 # = ((x & y) << 1) + (x ^ y)
581 #
582 # Since we know that the bottom bit of (x & y) << 1 is zero,
583 #
584 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
585 # = (x & y) + ((x ^ y) >> 1)
586 binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
587 binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
588
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
#           = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
#           = ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
#
# Note the subtraction in the final form: adding ((x ^ y) >> 1) instead
# gives e.g. 4 rather than 2 for x = 3, y = 0.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
601
# Modulus and remainder.  The integer forms fold a zero divisor to 0.
602 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
603
604 # For signed integers, there are several different possible definitions of
605 # "modulus" or "remainder". We follow the conventions used by LLVM and
606 # SPIR-V. The irem opcode implements the standard C/C++ signed "%"
607 # operation while the imod opcode implements the more mathematical
608 # "modulus" operation. For details on the difference, see
609 #
610 # http://mathforum.org/library/drmath/view/52343.html
611
612 binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# operands have different signs.
613 binop("imod", tint, "",
614 "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
615 " src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient down (floorf) and frem rounds it toward zero
# (truncf); both use the single-precision routines for every bit size.
616 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
617 binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
618
619 #
620 # Comparisons
621 #
622
623
624 # these integer-aware comparisons return a boolean (0 or ~0)
625
# binop_compare declares the result as bool1; the *32 opcodes further down
# use binop_compare32, which declares it as bool32.  Only the equality and
# inequality tests are marked commutative.
626 binop_compare("flt", tfloat, "", "src0 < src1")
627 binop_compare("fge", tfloat, "", "src0 >= src1")
628 binop_compare("feq", tfloat, _2src_commutative, "src0 == src1")
629 binop_compare("fne", tfloat, _2src_commutative, "src0 != src1")
630 binop_compare("ilt", tint, "", "src0 < src1")
631 binop_compare("ige", tint, "", "src0 >= src1")
632 binop_compare("ieq", tint, _2src_commutative, "src0 == src1")
633 binop_compare("ine", tint, _2src_commutative, "src0 != src1")
634 binop_compare("ult", tuint, "", "src0 < src1")
635 binop_compare("uge", tuint, "", "src0 >= src1")
636 binop_compare32("flt32", tfloat, "", "src0 < src1")
637 binop_compare32("fge32", tfloat, "", "src0 >= src1")
638 binop_compare32("feq32", tfloat, _2src_commutative, "src0 == src1")
639 binop_compare32("fne32", tfloat, _2src_commutative, "src0 != src1")
640 binop_compare32("ilt32", tint, "", "src0 < src1")
641 binop_compare32("ige32", tint, "", "src0 >= src1")
642 binop_compare32("ieq32", tint, _2src_commutative, "src0 == src1")
643 binop_compare32("ine32", tint, _2src_commutative, "src0 != src1")
644 binop_compare32("ult32", tuint, "", "src0 < src1")
645 binop_compare32("uge32", tuint, "", "src0 >= src1")
646
647 # integer-aware GLSL-style comparisons that compare floats and ints
648
# Each binop_reduce call declares 2-, 3- and 4-component variants.  The "all"
# forms AND the per-component equality tests together and the "any" forms OR
# the per-component inequality tests.
649 binop_reduce("ball_fequal", 1, tbool1, tfloat, "{src0} == {src1}",
650 "{src0} && {src1}", "{src}")
651 binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
652 "{src0} || {src1}", "{src}")
653 binop_reduce("ball_iequal", 1, tbool1, tint, "{src0} == {src1}",
654 "{src0} && {src1}", "{src}")
655 binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
656 "{src0} || {src1}", "{src}")
657
# Same reductions with a bool32 result.
658 binop_reduce("b32all_fequal", 1, tbool32, tfloat, "{src0} == {src1}",
659 "{src0} && {src1}", "{src}")
660 binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
661 "{src0} || {src1}", "{src}")
662 binop_reduce("b32all_iequal", 1, tbool32, tint, "{src0} == {src1}",
663 "{src0} && {src1}", "{src}")
664 binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
665 "{src0} || {src1}", "{src}")
666
667 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
668
669 binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
670 "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
671 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
672 "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
673
674 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
675 # and false respectively
676
# NOTE(review): sge is declared with the unsized float type while slt, seq
# and sne use float32 -- confirm the difference is intended.
677 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
678 binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
679 binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
680 binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
681
682 # SPIRV shifts are undefined for shift-operands >= bitsize,
683 # but SM5 shifts are defined to use the least significant bits, only
684 # The NIR definition is according to the SM5 specification.
# The shift count is always a uint32 source and is masked with the bit width
# of src0 minus one before use.
685 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
686 "src0 << (src1 & (sizeof(src0) * 8 - 1))")
687 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
688 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
689 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
690 "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
691
# Rotates: the bits shifted out on one side are OR-ed back in on the other,
# using the negated and masked count for the opposite shift.
692 opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
693 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
694 dst = (src0 << (src1 & rotate_mask)) |
695 (src0 >> (-src1 & rotate_mask));
696 """)
697 opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
698 uint32_t rotate_mask = sizeof(src0) * 8 - 1;
699 dst = (src0 >> (src1 & rotate_mask)) |
700 (src0 << (-src1 & rotate_mask));
701 """)
702
703 # bitwise logic operators
704 #
705 # These are also used as boolean and, or, xor for hardware supporting
706 # integers.
707
708
709 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
710 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
711 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
712
713
# Dot products: per-component products summed together.  The _replicated
# forms are declared with an output size of 4 instead of 1.
714 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
715 "{src}")
716
717 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
718 "{src0} * {src1}", "{src0} + {src1}", "{src}")
719
# fdph: three-component dot product of src0 with src1.xyz, plus src1.w.
720 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
721 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
722 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
723 "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
724
# Minimum and maximum.  The float forms carry no algebraic properties and
# call the single-precision fminf/fmaxf for every bit size.
725 binop("fmin", tfloat, "", "fminf(src0, src1)")
726 binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
727 binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
728 binop("fmax", tfloat, "", "fmaxf(src0, src1)")
729 binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
730 binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
731
# The opcodes in this group treat each 32-bit source as four 8-bit lanes and
# loop over the lanes with i = 0, 8, 16, 24.
732 # Saturated vector add for 4 8bit ints.
733 binop("usadd_4x8", tint32, _2src_commutative + associative, """
734 dst = 0;
735 for (int i = 0; i < 32; i += 8) {
736 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
737 }
738 """)
739
740 # Saturated vector subtract for 4 8bit ints.
# A lane is left at 0 unless the src0 lane is larger than the src1 lane.
741 binop("ussub_4x8", tint32, "", """
742 dst = 0;
743 for (int i = 0; i < 32; i += 8) {
744 int src0_chan = (src0 >> i) & 0xff;
745 int src1_chan = (src1 >> i) & 0xff;
746 if (src0_chan > src1_chan)
747 dst |= (src0_chan - src1_chan) << i;
748 }
749 """)
750
751 # vector min for 4 8bit ints.
752 binop("umin_4x8", tint32, _2src_commutative + associative, """
753 dst = 0;
754 for (int i = 0; i < 32; i += 8) {
755 dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
756 }
757 """)
758
759 # vector max for 4 8bit ints.
760 binop("umax_4x8", tint32, _2src_commutative + associative, """
761 dst = 0;
762 for (int i = 0; i < 32; i += 8) {
763 dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
764 }
765 """)
766
767 # unorm multiply: (a * b) / 255.
768 binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
769 dst = 0;
770 for (int i = 0; i < 32; i += 8) {
771 int src0_chan = (src0 >> i) & 0xff;
772 int src1_chan = (src1 >> i) & 0xff;
773 dst |= ((src0_chan * src1_chan) / 255) << i;
774 }
775 """)
776
# Use the double-precision pow() only for 64-bit operands and the
# single-precision powf() otherwise, matching how the other opcodes in this
# file (e.g. fsqrt with sqrt/sqrtf) choose between the two libm variants.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
778
# Two-source packing: src0 provides the low half and src1 the high half.
779 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
780 "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
781
782 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
783 "src0 | ((uint64_t)src1 << 32)")
784
785 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
786 "src0 | ((uint32_t)src1 << 16)")
787
788 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
789 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
790 # are from the low five bits of src0 and src1, respectively.
# The result is a mask of `bits` ones shifted left by `offset`.
791 binop_convert("bfm", tuint32, tint32, "", """
792 int bits = src0 & 0x1F;
793 int offset = src1 & 0x1F;
794 dst = ((1u << bits) - 1) << offset;
795 """)
796
# NOTE(review): isnormal() is also false for zero, infinity and NaN, so any
# such result is replaced by a zero carrying the sign of src0, not only
# denormals as the C comment says -- confirm this is intended.
797 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
798 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
799 /* flush denormals to zero. */
800 if (!isnormal(dst))
801 dst = copysignf(0.0f, src0);
802 """)
803
804 # Combines the first component of each input to make a 2-component vector.
805
806 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
807 dst.x = src0.x;
808 dst.y = src1.x;
809 """)
810
# For the extract opcodes, src1 selects which byte or word of src0 to return;
# the cast to a signed or unsigned narrow type decides between sign and zero
# extension.
811 # Byte extraction
812 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
813 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
814
815 # Word extraction
816 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
817 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
818
819
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose sources and result share a type.

    - name is the opcode name handed to opcode()
    - ty is used as the output type and as the type of all three inputs
    - alg_props is the algebraic-properties string, forwarded unchanged
    - const_expr is the constant-folding expression, forwarded unchanged

    The output size and all three input sizes are passed as 0, and the
    sixth argument to opcode() is always False.
    """
    source_count = 3
    source_sizes = [0] * source_count
    source_types = [ty] * source_count
    opcode(name, 0, ty, source_sizes, source_types, False, alg_props,
           const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source opcode with explicit per-operand sizes.

    - name is the opcode name handed to opcode()
    - output_size is the size of the result
    - src1_size, src2_size, src3_size are the sizes of the three inputs,
      in source order
    - const_expr is the constant-folding expression, forwarded unchanged

    The output and every input are typed tuint, the algebraic-properties
    string is empty, and the sixth argument to opcode() is always False.
    """
    source_sizes = [src1_size, src2_size, src3_size]
    source_types = [tuint for _ in source_sizes]
    opcode(name, output_size, tuint, source_sizes, source_types, False, "",
           const_expr)
826
# Floating-point multiply-add: src0 * src1 + src2.  Flagged with
# _2src_commutative; the constant expression is a plain C multiply followed
# by an add.
triop("ffma", tfloat, _2src_commutative, "src0 * src1 + src2")

# Floating-point linear interpolation between src0 and src1 with weight src2:
# a weight of 0 yields src0 and a weight of 1 yields src1.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
830
# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors).  fcsel is the floating-point flavour: all three
# operands are 32-bit floats, and src1 is selected when the condition src0
# compares unequal to 0.0f, src2 otherwise.  The flavours that take a boolean
# condition (bcsel, b32csel) are registered further down in this file.


triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
839
# 3 way min/max/med
#
# The float variants are registered on the unsized float type, so their
# sources may be 64-bit.  They therefore fold with the double-precision
# fmin()/fmax(): these return one of their arguments unchanged, which is exact
# for narrower floats too, whereas fminf()/fmaxf() would round 64-bit operands
# through float.
#
# The med3 expressions compute the median of the three sources as
# max(min(max(src0, src1), src2), min(src0, src1)).
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
852
# Conditional select with a boolean condition: src1 when src0 is true, src2
# otherwise.  The two opcodes differ only in the type of the condition source
# (tbool1 for bcsel, tbool32 for b32csel); the selected values and the result
# are unsized uints.
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
857
# SM5 bfi assembly
#
# Sources are (mask, insert, base).  A zero mask returns base unchanged.
# Otherwise insert is shifted left by the number of trailing zero bits in the
# mask (the loop shifts a copy of the mask right until its bit 0 is set,
# shifting insert left in step), and the result takes the shifted insert bits
# where the mask is set and the base bits everywhere else.  The zero-mask
# check also guarantees that the loop terminates.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
unsigned tmp = mask;
while (!(tmp & 1)) {
tmp >>= 1;
insert <<= 1;
}
dst = (base & ~mask) | (insert & mask);
}
""")
872
873
# Bitwise select: for every bit position, the result takes the bit from src1
# where src0 has a 1 and the bit from src2 where src0 has a 0.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
875
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
#
# ubfe is the unsigned extract; sources are (base, offset, bits):
#  - bits == 0 yields 0;
#  - if the field ends below bit 32 (offset + bits < 32), base is shifted left
#    so the field's top bit sits at bit 31 and then shifted right by
#    32 - bits, which moves the field down to bit 0 and zero-fills above it
#    because base is unsigned;
#  - otherwise the field reaches the top of the word and base >> offset is
#    the result.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
dst = 0;
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
# Signed bitfield extract; sources are (base, offset, bits), and only the low
# five bits of offset and bits are used:
#  - bits == 0 yields 0;
#  - if offset + bits < 32, base is shifted left so the field's top bit sits
#    at bit 31 and then shifted right by 32 - bits to bring the field down to
#    bit 0;
#  - otherwise the result is base >> offset.
# base is a signed int here, so the right shifts are expected to replicate
# the field's top bit into the upper bits of the result.
# NOTE(review): this relies on the C compiler implementing >> on a negative
# int as an arithmetic shift.
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
dst = 0;
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
903
# GLSL bitfieldExtract()
#
# Unsigned variant; sources are (base, offset, bits) with offset and bits as
# signed 32-bit ints:
#  - bits == 0 yields 0;
#  - a negative offset or bits, or a field that would run past bit 31
#    (offset + bits > 32), also yields 0 (the in-string comment marks this
#    case as undefined per the spec);
#  - otherwise base is shifted down by offset and masked to the low `bits`
#    bits.  The mask is built from 1ull so that bits == 32 does not overflow
#    the shift.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed variant of GLSL bitfieldExtract(); sources are (base, offset, bits),
# all signed 32-bit ints:
#  - bits == 0 yields 0;
#  - a negative offset or bits, or offset + bits > 32, also yields 0;
#  - otherwise base is shifted left by 32 - offset - bits so the field's top
#    bit sits at bit 31, then shifted right by 32 - bits so the field's
#    lowest bit lands at bit 0 and its top bit is replicated into the upper
#    bits of the result.
# The right-shift amount has to be 32 - bits; shifting by offset instead
# would leave the field at the wrong position.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
dst = 0;
} else {
dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
929
# Combines the first component of each input to make a 3-component vector.
#
# All three sources are declared 1 component wide and the output 3 components
# wide; src0.x, src1.x and src2.x become dst.x, dst.y and dst.z.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
937
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source opcode with explicit per-operand sizes.

    - name is the opcode name handed to opcode()
    - output_size is the size of the result
    - src1_size .. src4_size are the sizes of the four inputs, in source
      order
    - const_expr is the constant-folding expression, forwarded unchanged

    The output and every input are typed tuint, the algebraic-properties
    string is empty, and the sixth argument to opcode() is always False.
    """
    source_sizes = [src1_size, src2_size, src3_size, src4_size]
    source_types = [tuint for _ in source_sizes]
    opcode(name, output_size, tuint, source_sizes, source_types, False, "",
           const_expr)
944
# Bitfield insert; sources are (base, insert, offset, bits) with offset and
# bits as signed 32-bit ints:
#  - bits == 0 returns base unchanged;
#  - a negative offset or bits, or bits + offset > 32, yields 0;
#  - otherwise a mask of `bits` ones starting at bit `offset` is built (from
#    1ull, so bits == 32 does not overflow the shift), and the result takes
#    the low bits of insert, moved up to `offset`, inside the mask and the
#    bits of base outside it.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
dst = 0;
} else {
unsigned mask = ((1ull << bits) - 1) << offset;
dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
958
# Combines the first component of each input to make a 4-component vector:
# src0.x, src1.x, src2.x and src3.x become dst.x, dst.y, dst.z and dst.w.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
965
# ir3-specific instruction that maps directly to mul-add shift high mix
# (IMADSH_MIX16, i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
#
# The constant expression multiplies the high 16 bits of src0.x by the low 16
# bits of src1.x, shifts the product left by 16, and adds src2.x.  All
# operands and the result are single-component 32-bit ints.
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")