nir/glsl: Add another way of doing lower_imul64 for gen8+
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Record holding everything this file knows about one opcode.

    NOTE: this must be kept in sync with nir_op_info
    """

    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, algebraic_properties, const_expr):
        """Validate and store the description of an opcode.

        Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_sizes and input_types are lists of equal length, one entry
          per source
        - algebraic_properties is a space-separated string, where nir_op_is_
          is prepended before each entry
        - const_expr is an expression or series of statements that computes
          the constant value of the opcode given the constant values of its
          inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component
        input and output variables will be scalars and non-per-component
        input and output variables will be a struct with fields named x, y,
        z, and w all of the correct type.  Input and output variables can be
        assumed to already be of the correct type and need no conversion.
        In particular, the conversion from the C bool type to/from NIR_TRUE
        and NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        # Argument type checks.  Only the first element of each list is
        # inspected, so an empty list raises IndexError here.
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        assert isinstance(input_sizes[0], int)
        assert isinstance(input_types, list)
        assert isinstance(input_types[0], str)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)

        # One type per source, and all sizes within 0..4.
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4

        # A source size of 0 is only accepted when output_size is 0 as well.
        zero_sized_output = (output_size == 0)
        for width in input_sizes:
            assert 0 <= width <= 4
            assert zero_sized_output or width != 0

        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
89
# helper variables for strings
#
# Each constant holds a NIR type name.  Names without a numeric suffix
# ("float", "int", "bool", "uint") carry no bit size; a numeric suffix fixes
# the bit size (see type_has_size() and type_size() below).
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
104
# Splits a NIR type string into its base type ("type") and an optional
# trailing bit size ("bits").
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def _match_type(type_):
    """Match type_ against _TYPE_SPLIT_RE, asserting that it is a NIR type."""
    found = _TYPE_SPLIT_RE.match(type_)
    assert found is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return found


def type_has_size(type_):
    """Return True when the type string carries an explicit bit size."""
    return _match_type(type_).group('bits') is not None


def type_size(type_):
    """Return the bit size of a sized type string as an int.

    Asserts that the string is a valid NIR type and that it has a bit size.
    """
    bits = _match_type(type_).group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)


def type_sizes(type_):
    """Return the list of bit sizes that the given type string can take.

    A sized type yields a one-element list.  Unsized 'bool' and 'float' have
    their own lists; every other unsized type gets the integer list.
    """
    if type_has_size(type_):
        return [type_size(type_)]

    # Built per call so that callers always receive a fresh list.
    unsized = {
        'bool': [1, 32],
        'float': [16, 32, 64],
    }
    return unsized.get(type_, [1, 8, 16, 32, 64])


def type_base_type(type_):
    """Return the base type name (the type string without its bit size)."""
    return _match_type(type_).group('type')
133
# Values for the algebraic_properties argument.  Each ends with a space so
# that several of them can be joined with "+".
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
opcodes = {}


def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode from the arguments and store it in opcodes[name].

    Asserts that no opcode with this name has been registered before.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes,
                  input_types, algebraic_properties, const_expr)
    opcodes[name] = info
145
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output and input types may differ.

    Both the output size and the source size are passed as 0 and no
    algebraic properties are set.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], algebraic_properties="",
           const_expr=const_expr)


def unop(name, ty, const_expr):
    """Register a one-source opcode whose output and input share type ty."""
    opcode(name, output_size=0, output_type=ty, input_sizes=[0],
           input_types=[ty], algebraic_properties="", const_expr=const_expr)


def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and source sizes."""
    sizes = [input_size]
    types = [input_type]
    opcode(name, output_size, output_type, sizes, types, "", const_expr)
156
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a reduction opcode.

    prereduce_expr is formatted with {src} for each source component and
    wrapped in parentheses, reduce_expr combines two such terms through
    {src0}/{src1}, and final_expr receives the parenthesized reduction as
    {src}.  The opcodes are named name + "2", name + "3" and name + "4".
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [
        "(" + prereduce_expr.format(src="src0." + comp) + ")"
        for comp in "xyzw"
    ]
    xy = combine(x, y)

    # 3 components reduce as ((x, y), z); 4 components as ((x, y), (z, w)).
    variants = (
        (2, xy),
        (3, combine(xy, z)),
        (4, combine(xy, combine(z, w))),
    )
    for width, expr in variants:
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, finish(expr))
175
176
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Negation and bitwise/logical NOT.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot: 1.0 when the source is zero, 0.0 otherwise.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# Sign: 0 for a zero source, otherwise +1 or -1.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat: clamp to [0, 1]; anything not greater than 0 becomes 0.
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# These two call the single-precision C functions for every bit size.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
201
# Generate all of the numeric conversion opcodes
#
# Opcode names are built as <first letter of source type>2<first letter of
# destination type><destination bit size>, e.g. "i2f32".  The table below
# lists, for every source type, the destination types to generate.
_CONVERSION_DST_TYPES = {
    tint: [tfloat, tint, tbool],
    tuint: [tfloat, tuint],
    tfloat: [tint, tuint, tfloat, tbool],
    tbool: [tfloat, tint],
}

for src_t in [tint, tuint, tfloat, tbool]:
    dst_types = _CONVERSION_DST_TYPES[src_t]

    for dst_t in dst_types:
        for bit_size in type_sizes(dst_t):
            is_float_to_half = (src_t == tfloat and dst_t == tfloat and
                                bit_size == 16)
            if not is_float_to_half:
                # Conversions to bool test the source against zero.
                conv_expr = "src0 != 0" if dst_t == tbool else "src0"
                unop_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
                             dst_t + str(bit_size), src_t, conv_expr)
                continue

            # float -> float16 additionally gets one opcode per rounding
            # mode suffix; the empty suffix is the plain variant.
            rnd_modes = ['_rtne', '_rtz', '']
            for rnd_mode in rnd_modes:
                unop_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
                                                    bit_size, rnd_mode),
                             dst_t + str(bit_size), src_t, "src0")
225
226
# Unary floating-point rounding operations.
#
# Each expression picks the double-precision C function when bit_size is 64
# and the single-precision one otherwise.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values whose magnitude is below 2^-14 become a zero with the sign of the
# source; everything else is round-tripped through the half-float helpers.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp stores the exponent written by frexp() into dst; frexp_sig keeps
# frexp()'s return value and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat64, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat64, tfloat64, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
257
258
259 # Floating point pack and unpack operations.
260
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components into one uint32.

    Every occurrence of "fmt" in the C template is replaced by the format
    name, so the expression calls pack_<fmt>_1x16().  The x component lands
    in the low 16 bits and y in the high 16 bits.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))


def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components into one uint32.

    Every occurrence of "fmt" in the C template is replaced by the format
    name, so the expression calls pack_<fmt>_1x8().  Components x, y, z, w
    occupy successive bytes starting at the least significant one.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
274
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 into two float32 components.

    Every occurrence of "fmt" in the C template is replaced by the format
    name, so the expression calls unpack_<fmt>_1x16().  dst.x is decoded from
    the low 16 bits and dst.y from the high 16 bits of the source.
    """
    # The high half must be shifted *down* before the uint16_t cast; shifting
    # left by 16 and truncating to 16 bits would always produce 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
280
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 into four float32 components.

    Every occurrence of "fmt" in the C template is replaced by the format
    name, so the expression calls unpack_<fmt>_1x8().  Components x, y, z, w
    are decoded from successive bytes starting at the least significant one.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
288
289
# Instantiate the float pack/unpack opcodes for the snorm, unorm and half
# formats.  Only the 2x16 variants exist for "half".
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Low 16 bits of x in the low half, y shifted into the high half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# x, y, z, w shifted to bit offsets 0, 8, 16 and 24 and OR-ed together; the
# components are not masked first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# Pure integer packing: the components are concatenated, x lowest.  The
# casts widen a component before it is shifted.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Inverse of pack_64_2x32: x gets the low 32 bits, y the high 32 bits.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
323
# Split one 64-bit value into four 16-bit words, lowest word first.  The
# source has a single component, so every word (including dst.w) must be
# taken from src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
326
# Split one 32-bit value into two 16-bit words: x is the low word, y the
# high word.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.
#
# The "_split_x" opcodes produce the low half of the source and the
# "_split_y" opcodes the high half, each as its own opcode.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
343
# Bit operations, part of ARB_gpu_shader5.


# Mirror the 32 bits of the source: bit N of the source becomes bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Count the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")

# Index of the most significant set bit, or -1 when no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")

# Signed variant, scanning from bit 31 down: the result is -1 when no
# qualifying bit is found.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
394
395
# fnoise<i>_<j>: i output components from j source components, for every
# i and j in 1..4.  The constant expression is always 0.0f.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")


# AMD_gcn_shader extended instructions
#
# Both opcodes pick the axis of src0 with the largest magnitude and branch
# on its sign.  The tests are independent "if" statements, so when several
# match (ties between magnitudes) the last matching one determines dst.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
""")

# Face index: 0/1 for +x/-x, 2/3 for +y/-y, 4/5 for +z/-z.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
426
427
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    The output type may differ from the source type; all sizes are 0.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0, 0],
           input_types=[in_type, in_type], algebraic_properties=alg_props,
           const_expr=const_expr)


def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and output share type ty."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)


def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty with a bool1 result."""
    binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)


def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty with a bool32 result."""
    binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)


def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types.

    No algebraic properties are set.
    """
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, "", const_expr)
444
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a two-source reduction.

    prereduce_expr is formatted with {src0}/{src1} for each pair of matching
    source components and wrapped in parentheses, reduce_expr combines two
    such terms, and final_expr receives the parenthesized reduction as {src}.
    The opcodes are named name + "2", name + "3" and name + "4" and are all
    registered as commutative.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + comp,
                                    src1="src1." + comp) + ")"
        for comp in "xyzw"
    ]
    xy = combine(x, y)

    # 3 components reduce as ((x, y), z); 4 components as ((x, y), (z, w)).
    variants = (
        (2, xy),
        (3, combine(xy, z)),
        (4, combine(xy, combine(z, w))),
    )
    for width, expr in variants:
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], commutative,
               finish(expr))
466
# Basic addition, subtraction and multiplication.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
# Unsigned saturating add: a wrapped sum (smaller than src0) yields the
# all-ones constant instead.
# NOTE(review): if the generated C gives src0/src1 a type narrower than int,
# integer promotion would hide the wrap-around this test relies on -- confirm
# against the constant-expression generator.
binop("uadd_sat", tuint, commutative,
      "(src0 + src1) < src0 ? UINT64_MAX : (src0 + src1)")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
#
# For bit sizes other than 64 the product is computed in 64 bits and shifted
# right by bit_size; the 64-bit case goes through ubm_mul_u32arr() on arrays
# of 32-bit words.
binop("imul_high", tint, commutative, """
if (bit_size == 64) {
/* We need to do a full 128-bit x 128-bit multiply in order for the sign
* extension to work properly. The casts are kind-of annoying but needed
* to prevent compiler warnings.
*/
uint32_t src0_u32[4] = {
src0,
(int64_t)src0 >> 32,
(int64_t)src0 >> 63,
(int64_t)src0 >> 63,
};
uint32_t src1_u32[4] = {
src1,
(int64_t)src1 >> 32,
(int64_t)src1 >> 63,
(int64_t)src1 >> 63,
};
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
#
# Same structure as imul_high, with unsigned arithmetic and two-word sources.
binop("umul_high", tuint, commutative, """
if (bit_size == 64) {
/* The casts are kind-of annoying but needed to prevent compiler warnings. */
uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# Division.  The integer variants produce 0 for a zero divisor.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# Unsigned modulus; produces 0 for a zero divisor.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder". We follow the conventions used by LLVM and
# SPIR-V. The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation. For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod: the C remainder is kept when it is zero or when both operands have
# the same sign; otherwise src1 is added to it.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# NOTE(review): fmod/frem call the single-precision floorf/truncf for every
# bit size -- confirm this is acceptable for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
555
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
#
# The unsuffixed opcodes are registered through binop_compare (bool1 result)
# and the "32"-suffixed ones through binop_compare32 (bool32 result).

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
binop_compare32("flt32", tfloat, "", "src0 < src1")
binop_compare32("fge32", tfloat, "", "src0 >= src1")
binop_compare32("feq32", tfloat, commutative, "src0 == src1")
binop_compare32("fne32", tfloat, commutative, "src0 != src1")
binop_compare32("ilt32", tint, "", "src0 < src1")
binop_compare32("ige32", tint, "", "src0 >= src1")
binop_compare32("ieq32", tint, commutative, "src0 == src1")
binop_compare32("ine32", tint, commutative, "src0 != src1")
binop_compare32("ult32", tuint, "", "src0 < src1")
binop_compare32("uge32", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# "all" variants AND the per-component equality tests together; "any"
# variants OR the per-component inequality tests together.

binop_reduce("ball_fequal", 1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal", 1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

binop_reduce("b32all_fequal", 1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal", 1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with the unsized tfloat while slt, seq and
# sne use tfloat32 -- confirm whether the difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
618
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits only.
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a uint32 and is masked with the bit width of
# src0 minus one before it is applied.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: per-component products summed together.  The "_replicated"
# variant is registered with an output size of 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: dot product of the 3-component src0 with the xyz part of src1, plus
# src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Two-operand minimum and maximum.  The float variants call the
# single-precision fminf/fmaxf and are registered without algebraic
# properties.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")

# Saturated vector add for 4 8bit ints.
# Each byte lane is added separately and clamped to 0xff.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A lane stays 0 unless the src0 byte is larger than the src1 byte.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
711
# Power function.  As with the other float opcodes in this file, the
# double-precision C function is used for 64-bit sources and the
# single-precision one otherwise.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
713
# Split packing: src0 supplies the low half of the result and src1 the high
# half.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# The constant expression yields 0 for out-of-range arguments and otherwise
# a mask of "bits" ones shifted left by "offset".
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
dst = 0; /* undefined */
else
dst = ((1u << bits) - 1) << offset;
""")

# ldexp: the exponent operand is always an int32.  Results for which
# isnormal() is false are replaced by a zero carrying the sign of src0.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# src1 selects the byte; the cast makes the result zero-extended (u8) or
# sign-extended (i8).
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# src1 selects the 16-bit word, with the same extension rules as above.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
755
756
def triop(name, ty, const_expr):
    """Register a three-source opcode whose sources and output share type ty.

    All sizes are 0 and no algebraic properties are set.
    """
    opcode(name, output_size=0, output_type=ty, input_sizes=[0, 0, 0],
           input_types=[ty, ty, ty], algebraic_properties="",
           const_expr=const_expr)


def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source uint opcode with explicit sizes.

    The output and all three sources use the tuint type.
    """
    sizes = [src1_size, src2_size, src3_size]
    types = [tuint] * 3
    opcode(name, output_size, tuint, sizes, types, "", const_expr)
763
# Fused multiply-add, written as a plain multiply followed by an add.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")

# 3 way min/max/med
triop("fmin3", tfloat, "fminf(src0, fminf(src1, src2))")
triop("imin3", tint, "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")

# Integer-bool selects: the condition is bool1 for bcsel and bool32 for
# b32csel; the selected values are uint.
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], "", "src0 ? src1 : src2")

# SM5 bfi assembly
# "insert" is shifted left once for every trailing zero bit of "mask" and
# then merged into "base" under the mask.  A zero mask returns base.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
unsigned tmp = mask;
while (!(tmp & 1)) {
tmp >>= 1;
insert <<= 1;
}
dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
# Extract "bits" bits of base starting at "offset".  Zero bits, or a
# negative offset/bits, give 0; a field reaching bit 32 or beyond degrades
# to a plain right shift by offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
# Signed variant: base is an int, so the right shifts operate on a signed
# value.
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
# Unlike ubfe, a field that extends past bit 32 yields 0 here.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed GLSL bitfieldExtract(): extract "bits" bits of base starting at
# "offset" and sign-extend the result.  Zero bits, negative arguments, or a
# field extending past bit 32 yield 0.
#
# The field is first shifted up so that its top bit sits in bit 31, then
# shifted back down by (32 - bits) so that the field ends at bit 0 with its
# sign replicated above it (the same form ibfe uses).
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
dst = 0;
} else {
dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
865
# Combines the first component of each input to make a 3-component vector.
# Registered through triop_horiz, so the output and all sources are uint.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
873
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source uint opcode with explicit sizes.

    The output and all four sources use the tuint type and no algebraic
    properties are set.
    """
    sizes = [src1_size, src2_size, src3_size, src4_size]
    types = [tuint] * 4
    opcode(name, output_size, tuint, sizes, types, "", const_expr)
880
# Insert the low "bits" bits of "insert" into "base" at bit "offset".
# Zero bits return base unchanged; negative arguments or a field extending
# past bit 32 yield 0.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
dst = 0;
} else {
unsigned mask = ((1ull << bits) - 1) << offset;
dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
901
902