nir: add frexp_exp and frexp_sig opcodes
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26
27 # Class that represents all the information we have about the opcode
28 # NOTE: this must be kept in sync with nir_op_info
29
class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - output_size and every entry of input_sizes are integers from 0 to 4;
          when output_size is non-zero, no input size may be zero
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types, one per entry of input_sizes
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.

        Malformed arguments trip an assertion.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        # Check every element, not just the first one, so that a malformed
        # later source is caught here rather than in the generated C.
        assert all(isinstance(size, int) for size in input_sizes)
        assert isinstance(input_types, list)
        assert all(isinstance(ty, str) for ty in input_types)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            # A sized output requires every input to be sized as well.
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
88
# helper variables for strings
#
# Type names handed to opcode() and its wrappers.  The first four carry no
# bit-size suffix; the remaining ones name an explicit 32- or 64-bit type.
tfloat = "float"
tint = "int"
tbool = "bool32"
tuint = "uint"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Algebraic-property tokens.  Each ends with a space so that several can be
# concatenated with "+" into one space-separated algebraic_properties string.
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes, keyed by opcode name; filled in by opcode()
opcodes = {}
106
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode from the arguments and register it under name.

    A name can be registered only once; a duplicate trips the assertion
    before any Opcode is constructed.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes,
                  input_types, algebraic_properties, const_expr)
    opcodes[name] = info
112
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output type may differ from its
    source type.  Output and source sizes are both 0 and no algebraic
    properties are set.
    """
    opcode(name, 0, out_type,
           input_sizes=[0],
           input_types=[in_type],
           algebraic_properties="",
           const_expr=const_expr)
115
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and output share type ty.
    Output and source sizes are both 0 and no algebraic properties are set.
    """
    source_sizes = [0]
    source_types = [ty]
    opcode(name, 0, ty, source_sizes, source_types, "", const_expr)
118
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and source sizes
    and no algebraic properties.
    """
    opcode(name, output_size, output_type,
           input_sizes=[input_size],
           input_types=[input_type],
           algebraic_properties="",
           const_expr=const_expr)
123
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a unary reduction.

    The three expression arguments are str.format templates:

    - prereduce_expr is applied to each component ({src}) and the result is
      wrapped in parentheses
    - reduce_expr combines two partial results ({src0}, {src1})
    - final_expr receives the parenthesised reduction ({src})

    The variants are registered through unop_horiz as name + "2", name + "3"
    and name + "4".
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = ["(" + prereduce_expr.format(src="src0." + comp) + ")"
                  for comp in "xyzw"]

    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               finish(combine(x, y)))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               finish(combine(combine(x, y), z)))
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               finish(combine(combine(x, y), combine(z, w))))
142
143
# Basic per-component unary opcodes.  Every const_expr is C source text; where
# the single- and double-precision forms differ it selects on bit_size.

# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot: 1.0 when the source compares equal to zero, 0.0 otherwise.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# fsign / isign: 0 for a zero source, 1 for a positive one, -1 otherwise.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat: values above 1 become 1, values at or below 0 become 0.
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): unlike the opcodes above, fexp2 and flog2 always call the
# single-precision C functions -- confirm that is intended for 64-bit sources.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
168
169 # Generate all of the numeric conversion opcodes
for src_t in [tint, tuint, tfloat]:
    # Floats convert to every numeric type; integers convert to float or to
    # their own signedness.
    if src_t == tfloat:
        dst_types = [tint, tuint, tfloat]
    else:
        dst_types = [tfloat, src_t]

    for dst_t in dst_types:
        # There is no 8-bit float destination.
        bit_sizes = [16, 32, 64] if dst_t == tfloat else [8, 16, 32, 64]
        for bit_size in bit_sizes:
            base_name = "{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size)
            # Float -> 16-bit float gets one opcode per rounding mode and no
            # unsuffixed opcode; every other conversion gets a single opcode.
            if src_t == tfloat and dst_t == tfloat and bit_size == 16:
                suffixes = ["_rtne", "_rtz", "_undef"]
            else:
                suffixes = [""]
            for suffix in suffixes:
                unop_convert(base_name + suffix, dst_t + str(bit_size),
                             src_t, "src0")
191
# We'll hand-code the to/from bool conversion opcodes. Because bool doesn't
# have multiple bit-sizes, we can always infer the size from the other type.
unop_convert("f2b", tbool, tfloat, "src0 != 0.0")
unop_convert("i2b", tbool, tint, "src0 != 0")
unop_convert("b2f", tfloat, tbool, "src0 ? 1.0 : 0.0")
unop_convert("b2i", tint, tbool, "src0 ? 1 : 0")


# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# fquantize2f16: magnitudes below ldexpf(1.0, -14) become a zero that keeps
# the source's sign; everything else is passed through _mesa_float_to_half
# and back through _mesa_half_to_float.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
#
# Both opcodes call C frexp() on a 64-bit float source: frexp_exp keeps the
# exponent written through the pointer argument, frexp_sig keeps the value
# frexp() returns.
unop_convert("frexp_exp", tint32, tfloat64, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat64, tfloat64, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
230
231
232 # Floating point pack and unpack operations.
233
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: a 2-component float32 source packed into one
    uint32, src0.x in the low 16 bits and src0.y in the high 16 bits.  Each
    half is produced by the C helper pack_<fmt>_1x16.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    op_name = "pack_{0}_2x16".format(fmt)
    unop_horiz(op_name, 1, tuint32, 2, tfloat32, template.replace("fmt", fmt))
239
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: a 4-component float32 source packed into one
    uint32, one byte per component with src0.x in the lowest byte.  Each byte
    is produced by the C helper pack_<fmt>_1x8.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    op_name = "pack_{0}_4x8".format(fmt)
    unop_horiz(op_name, 1, tuint32, 4, tfloat32, template.replace("fmt", fmt))
247
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 source unpacked into a
    2-component float32 result.  dst.x comes from the low 16 bits and dst.y
    from the high 16 bits, each decoded by the C helper unpack_<fmt>_1x16.
    """
    # The high half must be shifted *down*: (uint16_t)(x << 16) is always 0,
    # which made dst.y ignore the source entirely.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
253
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 source unpacked into a
    4-component float32 result, one byte per component with the lowest byte
    going to dst.x.  Each byte is decoded by the C helper unpack_<fmt>_1x8.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    op_name = "unpack_{0}_4x8".format(fmt)
    unop_horiz(op_name, 4, tfloat32, 1, tuint32, template.replace("fmt", fmt))
261
262
# Instantiate the pack/unpack opcode families for each supported format.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Integer packs: src0.x supplies the low bits.
# NOTE(review): pack_uvec2_to_uint masks only src0.x and pack_uvec4_to_uint
# masks nothing -- presumably callers guarantee in-range components; confirm.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# 64-bit <-> 2x32-bit: the .x component holds the low 32 bits.
unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.


unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# bit_count: number of set bits in the 32-bit source.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")

# ufind_msb: index of the highest set bit, or -1 when the source is 0.
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")

# ifind_msb: signed variant; dst stays -1 when no bit qualifies.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# find_lsb: index of the lowest set bit, or -1 when the source is 0.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
352
353
# Noise opcodes: one fnoise<i>_<j> for every output size i and source size j
# in 1..4.  The constant expression is simply 0.0f.
# range() rather than xrange() so the script also runs under Python 3.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
357
358
# AMD_gcn_shader extended instructions

# cube_face_coord: picks the two result components according to which source
# component has the largest magnitude and what its sign is.  dst starts at
# (0, 0), so a source that matches none of the tests yields zeros.  The tests
# are not mutually exclusive (ties satisfy several); the last match wins.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
""")
372
# cube_face_index: result is 0..5 for +X, -X, +Y, -Y, +Z, -Z, chosen by the
# source component with the largest magnitude and its sign (on ties the last
# matching test wins).  dst.x is initialised first, as in cube_face_coord, so
# that a source matching none of the tests (e.g. NaN components) does not
# leave the result unset.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
dst.x = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
384
385
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type and whose
    output type may differ.  Output and source sizes are all 0.
    """
    opcode(name, 0, out_type,
           input_sizes=[0, 0],
           input_types=[in_type] * 2,
           algebraic_properties=alg_props,
           const_expr=const_expr)
388
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and output all have type
    ty.
    """
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
391
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison: both sources have type ty and the
    output type is tbool.
    """
    binop_convert(name, out_type=tbool, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
394
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with an explicit size and type for the
    output and for each source, and no algebraic properties.
    """
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, "", const_expr)
399
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a binary reduction.

    The three expression arguments are str.format templates:

    - prereduce_expr is applied to matching components of the two sources
      ({src0}, {src1}) and the result is wrapped in parentheses
    - reduce_expr combines two partial results ({src0}, {src1})
    - final_expr receives the parenthesised reduction ({src})

    The variants are registered as name + "2", name + "3" and name + "4",
    each marked commutative, with both sources of type src_type.
    """
    def pairwise(comp):
        return "(" + prereduce_expr.format(src0="src0." + comp,
                                           src1="src1." + comp) + ")"

    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [pairwise(comp) for comp in "xyzw"]

    opcode(name + "2", output_size, output_type,
           [2, 2], [src_type] * 2, commutative,
           finish(combine(x, y)))
    opcode(name + "3", output_size, output_type,
           [3, 3], [src_type] * 2, commutative,
           finish(combine(combine(x, y), z)))
    opcode(name + "4", output_size, output_type,
           [4, 4], [src_type] * 2, commutative,
           finish(combine(combine(x, y), combine(z, w))))
421
# Basic arithmetic.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# Integer division by zero folds to 0 instead of invoking C undefined
# behaviour.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder". We follow the conventions used by LLVM and
# SPIR-V. The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation. For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod: when the C remainder is non-zero and the operands' signs differ,
# src1 is added so the result takes the sign of src1.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# NOTE(review): fmod and frem always use the single-precision floorf/truncf,
# even though tfloat carries no bit size -- confirm for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce("ball_fequal", 1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal", 1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

# NOTE(review): sge is declared with tfloat while slt, seq and sne use
# tfloat32 -- confirm whether the difference is intentional.
binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal


# Shifts: the shift count (second source) is always a 32-bit unsigned value,
# whatever the type of the shifted value.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: per-component multiply, then sum.  The _replicated variants
# have output size 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: three-component dot product of src0 with src1.xyz, plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# NOTE(review): fmin/fmax call the single-precision fminf/fmaxf and, unlike
# the integer variants, carry no algebraic properties -- confirm both.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
602
# fpow: src0 raised to the power src1.  The double-precision pow() serves
# 64-bit sources and powf() everything else; the two calls were previously
# swapped, which truncated 64-bit operands to float.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
604
# pack_half_2x16_split: src0.x fills the low 16 bits, src1.x the high 16.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# pack_64_2x32_split: src0 is the low 32 bits, src1 the high 32 bits.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# For in-range arguments the result is a mask of `bits` ones shifted left by
# `offset`; out-of-range arguments fold to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
dst = 0; /* undefined */
else
dst = ((1u << bits) - 1) << offset;
""")

# ldexp: the second source (the exponent) is always a 32-bit int.
# NOTE(review): the flush uses isnormal(), which is also false for zero,
# infinity and NaN, and copysignf() even for 64-bit results -- confirm.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# src1 selects which byte of src0; the cast picks zero- or sign-extension.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# src1 selects which 16-bit word of src0.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
643
644
def triop(name, ty, const_expr):
    """Register a three-source opcode whose sources and output all have type
    ty.  All sizes are 0 and no algebraic properties are set.
    """
    opcode(name, 0, ty,
           input_sizes=[0] * 3,
           input_types=[ty] * 3,
           algebraic_properties="",
           const_expr=const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source opcode with explicit output and source sizes.
    Output and sources are all of type tuint; no algebraic properties.
    """
    sizes = [src1_size, src2_size, src3_size]
    opcode(name, output_size, tuint, sizes, [tuint] * 3, "", const_expr)
651
# ffma: fused multiply-add, folded as a plain multiply followed by an add.
triop("ffma", tfloat, "src0 * src1 + src2")

# flrp: linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")

# SM5 bfi assembly
#
# `insert` is shifted left by the number of trailing zero bits in `mask`,
# then merged into `base` under the mask.  A zero mask returns `base`.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
unsigned tmp = mask;
while (!(tmp & 1)) {
tmp >>= 1;
insert <<= 1;
}
dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
#
# Extract `bits` bits of `base` starting at `offset`; when the field would
# run past bit 31 the result is simply base >> offset.  ibfe works on a
# signed base, ubfe on an unsigned one.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
#
# Unsigned variant: shift the field down to bit 0 and mask it to `bits` bits.
# The mask is built from 1ull so that bits == 32 does not overflow the shift.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed GLSL bitfieldExtract(): the field is first shifted up so that its
# top bit lands in bit 31, then shifted back down by (32 - bits) so that it
# ends at bit 0 with the sign replicated above it.  The down-shift used to be
# `offset`, which is only correct when offset == 32 - bits; (32 - bits)
# matches what ibfe does.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
dst = 0;
} else {
dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
737
# Combines the first component of each input to make a 3-component vector.
# Registered through triop_horiz, so output and sources are all tuint.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
745
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source opcode with explicit output and source sizes.
    Output and sources are all of type tuint; no algebraic properties.
    """
    sizes = [src1_size, src2_size, src3_size, src4_size]
    opcode(name, output_size, tuint, sizes, [tuint] * 4, "", const_expr)
752
# bitfield_insert: replace `bits` bits of `base`, starting at `offset`, with
# the low bits of `insert`.  A zero width returns `base` unchanged and
# out-of-range arguments fold to 0.  The mask is built from 1ull so that
# bits == 32 does not overflow the shift.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
dst = 0;
} else {
unsigned mask = ((1ull << bits) - 1) << offset;
dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
773
774