nir: sge operation is defined for floating-point types
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26
27 # Class that represents all the information we have about the opcode
28 # NOTE: this must be kept in sync with nir_op_info
29
class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.

        Raises AssertionError when an argument has the wrong type, when the
        two input lists differ in length, or when a size is out of range.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        # Validate every entry, not just the first one, so that a malformed
        # later element is caught and an empty list is not an IndexError.
        assert all(isinstance(size, int) for size in input_sizes)
        assert isinstance(input_types, list)
        assert all(isinstance(ty, str) for ty in input_types)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            # An opcode with a fixed output size must give every input a
            # fixed size as well.
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
88
# helper variables for strings
#
# These are the type names handed to opcode() and its wrappers as the
# output/input types.  Names with a numeric suffix carry an explicit bit
# size; the unsuffixed ones do not name one.
tfloat = "float"
tint = "int"
tbool = "bool32"
tuint = "uint"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Algebraic property names.  Each ends in a space so that several can be
# joined with "+" into the space-separated string Opcode expects.
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes, keyed by opcode name
opcodes = {}
106
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode from the arguments and store it in ``opcodes``.

    The name must not have been registered before; a duplicate trips the
    assertion.  Argument validation is done by the Opcode constructor.
    """
    assert name not in opcodes
    info = Opcode(name=name,
                  output_size=output_size,
                  output_type=output_type,
                  input_sizes=input_sizes,
                  input_types=input_types,
                  algebraic_properties=algebraic_properties,
                  const_expr=const_expr)
    opcodes[name] = info
112
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output type can differ from its
    input type.  Output and input sizes are both 0 and no algebraic
    properties are set.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0],
           input_types=[in_type],
           algebraic_properties="",
           const_expr=const_expr)
115
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type
    ``ty``.  Output and input sizes are both 0 and no algebraic properties
    are set.
    """
    opcode(name,
           output_size=0,
           output_type=ty,
           input_sizes=[0],
           input_types=[ty],
           algebraic_properties="",
           const_expr=const_expr)
118
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicitly given output and input
    sizes and types.  No algebraic properties are set.
    """
    sizes = [input_size]
    types = [input_type]
    opcode(name, output_size, output_type, sizes, types, "", const_expr)
123
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register ``name + "2"``, ``name + "3"`` and ``name + "4"``: reductions
    over a 2-, 3- and 4-component source.

    - prereduce_expr is formatted with {src} for each component of src0 and
      the result is wrapped in parentheses
    - reduce_expr is formatted with {src0} and {src1} to combine two partial
      results
    - final_expr is formatted with {src} set to the parenthesized reduction
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = ["(" + prereduce_expr.format(src=component) + ")"
                  for component in ("src0.x", "src0.y", "src0.z", "src0.w")]

    two_wide = finish(combine(x, y))
    unop_horiz(name + "2", output_size, output_type, 2, input_type, two_wide)

    three_wide = finish(combine(combine(x, y), z))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               three_wide)

    # The 4-wide form pairs up (x, y) and (z, w) before the final combine.
    four_wide = finish(combine(combine(x, y), combine(z, w)))
    unop_horiz(name + "4", output_size, output_type, 4, input_type, four_wide)
142
143
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Basic unary arithmetic.  Several float expressions have the form
# "bit_size == 64 ? <double form> : <float form>": the first arm uses the
# double-precision C functions/literals, the second the single-precision ones.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 for a zero source and 0.0 otherwise.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# fsign/isign yield 0 for zero, 1 for positive and -1 for everything else.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)")
# fsat clamps to [0, 1]: above 1 gives 1, at or below 0 gives 0.
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): unlike the opcodes above, these call the single-precision
# exp2f/log2f regardless of bit_size -- confirm 64-bit sources are not expected.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
168
# Generate all of the numeric conversion opcodes
#
# Opcode names are "<s>2<d><bits>", where <s> and <d> are the first letters
# of the source and destination type names.  Integer sources convert to
# float and to their own signedness; float sources convert to int, uint and
# float.  Float destinations come in 16/32/64 bits, integer destinations
# additionally in 8 bits.
for src_t in (tint, tuint, tfloat):
    dst_types = [tint, tuint, tfloat] if src_t == tfloat else [tfloat, src_t]

    for dst_t in dst_types:
        bit_sizes = [16, 32, 64] if dst_t == tfloat else [8, 16, 32, 64]
        for bit_size in bit_sizes:
            unop_convert("%s2%s%d" % (src_t[0], dst_t[0], bit_size),
                         dst_t + str(bit_size), src_t, "src0")
184
# We'll hand-code the to/from bool conversion opcodes. Because bool doesn't
# have multiple bit-sizes, we can always infer the size from the other type.
unop_convert("f2b", tbool, tfloat, "src0 != 0.0")
unop_convert("i2b", tbool, tint, "src0 != 0")
unop_convert("b2f", tfloat, tbool, "src0 ? 1.0 : 0.0")
unop_convert("b2i", tint, tbool, "src0 ? 1 : 0")


# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Sources with magnitude below 2^-14 become a zero carrying the source's
# sign; everything else is converted to half-float and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
220
221
222 # Floating point pack and unpack operations.
223
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components are each packed with
    pack_<fmt>_1x16 and combined into one uint32, src0.x in the low 16 bits
    and src0.y in the high 16 bits.
    """
    op_name = "pack_" + fmt + "_2x16"
    # Every "fmt" in the template is replaced by the actual format name.
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    unop_horiz(op_name, 1, tuint32, 2, tfloat32, template.replace("fmt", fmt))
229
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components are each packed with
    pack_<fmt>_1x8 and combined into one uint32, src0.x in the lowest byte
    through src0.w in the highest.
    """
    op_name = "pack_" + fmt + "_4x8"
    # Every "fmt" in the template is replaced by the actual format name.
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    unop_horiz(op_name, 1, tuint32, 4, tfloat32, template.replace("fmt", fmt))
237
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 is split into two 16-bit
    halves, each converted to float32 with unpack_<fmt>_1x16.  dst.x comes
    from the low half and dst.y from the high half.
    """
    # The high half must be shifted DOWN into the low 16 bits before the
    # uint16_t cast; shifting left by 16 would always truncate to zero.
    # Every "fmt" in the template is replaced by the actual format name.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
243
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 is split into four bytes, each
    converted to float32 with unpack_<fmt>_1x8.  dst.x comes from the lowest
    byte through dst.w from the highest.
    """
    op_name = "unpack_" + fmt + "_4x8"
    # Every "fmt" in the template is replaced by the actual format name.
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    unop_horiz(op_name, 4, tfloat32, 1, tuint32, template.replace("fmt", fmt))
251
252
# Instantiate the pack/unpack opcodes for each supported format.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Packs the low 16 bits of x together with y shifted into the high half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# NOTE(review): unlike pack_uvec2_to_uint, the components are not masked
# before shifting -- confirm the inputs are expected to fit in 8 bits.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# x supplies the low 32 bits of the 64-bit result, y the high 32 bits.
unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.


unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


# Bit N of the source becomes bit (31 - N) of the result.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the low 32 bits of the source.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")
308
# Index of the most significant set bit of an unsigned value, or -1 when the
# source is zero.  The scan has to include bit 0, otherwise a source of 1
# would incorrectly yield -1.
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
318
# Signed most-significant-bit search; dst stays -1 when no bit matches
# (which happens for sources of 0 and -1).
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the least significant set bit, or -1 when the source is zero.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
342
343
# Noise opcodes fnoise<i>_<j> for every output size i and input size j in
# 1..4; the constant expression always evaluates to 0.0f.  range() is used
# rather than xrange() so the script runs under both Python 2 and Python 3.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
347
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources both have type ``in_type``
    and whose result has type ``out_type``.  All sizes are 0.
    """
    opcode(name,
           output_size=0,
           output_type=out_type,
           input_sizes=[0, 0],
           input_types=[in_type, in_type],
           algebraic_properties=alg_props,
           const_expr=const_expr)
350
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type
    ``ty``.
    """
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
353
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison: sources of type ``ty``, result of
    type ``tbool``.
    """
    binop_convert(name, out_type=tbool, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
356
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicitly given sizes and types for
    the output and for each source.  No algebraic properties are set.
    """
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, "", const_expr)
361
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register ``name + "2"``, ``name + "3"`` and ``name + "4"``:
    commutative two-source reductions over 2-, 3- and 4-component vectors.

    - prereduce_expr is formatted with {src0}/{src1} set to matching
      components of the two sources and the result is parenthesized
    - reduce_expr is formatted with {src0}/{src1} to combine two partial
      results
    - final_expr is formatted with {src} set to the parenthesized reduction
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + c, src1="src1." + c) + ")"
        for c in ("x", "y", "z", "w")
    ]
    pair_types = [src_type, src_type]

    two_wide = finish(combine(x, y))
    opcode(name + "2", output_size, output_type, [2, 2], pair_types,
           commutative, two_wide)

    three_wide = finish(combine(combine(x, y), z))
    opcode(name + "3", output_size, output_type, [3, 3], pair_types,
           commutative, three_wide)

    # The 4-wide form pairs up (x, y) and (z, w) before the final combine.
    four_wide = finish(combine(combine(x, y), combine(z, w)))
    opcode(name + "4", output_size, output_type, [4, 4], pair_types,
           commutative, four_wide)
383
# Basic two-source arithmetic.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
binop("udiv", tuint, "", "src0 / src1")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# A zero divisor yields 0 rather than evaluating the C "%" operator.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder". We follow the conventions used by LLVM and
# SPIR-V. The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation. For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 back onto a non-zero C remainder when the operands have
# opposite signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# NOTE(review): floorf/truncf are used regardless of bit_size -- confirm
# whether 64-bit sources are expected here.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
429
430 #
431 # Comparisons
432 #
433
434
# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# Each call registers 2-, 3- and 4-component variants: the "all" forms AND
# the per-component results together, the "any" forms OR them.

binop_reduce("ball_fequal", 1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal", 1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
473
474
# Shifts: the value being shifted is int/uint, the shift count is always a
# uint32.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: per-component products summed together, registered in 2-,
# 3- and 4-component variants.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Same expression as fdot but registered with an output size of 4.
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Three-component dot product of src0 and src1 with src1.w added on.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# NOTE(review): fmin/fmax call the single-precision fminf/fmaxf regardless
# of bit_size and, unlike the integer variants, are not marked commutative
# or associative -- confirm both are intended.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
519
# The following opcodes treat each 32-bit source as four 8-bit channels and
# process one channel per loop iteration (i is the channel's bit offset).

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A channel is left at 0 unless src0's channel is strictly larger.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
564
# Power function.  64-bit sources must use the double-precision pow() and
# everything else the single-precision powf(), matching the other
# bit_size-dependent opcodes in this file.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
566
# Packs two floats as half-floats: src0 in the low 16 bits, src1 in the
# high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# src0 supplies the low 32 bits of the 64-bit result, src1 the high 32 bits.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# src0 is the field width in bits and src1 the bit offset; the result is a
# mask of that many one-bits starting at the offset.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
dst = 0; /* undefined */
else
dst = ((1u << bits) - 1) << offset;
""")

# ldexp with a float value and an int32 exponent.  Results that are not
# normal numbers are replaced by a zero carrying the sign of src0; the
# replacement uses copysignf regardless of bit_size.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# src1 selects which byte of src0 to return; the cast picks zero- or
# sign-extension.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# src1 selects which 16-bit word of src0 to return.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
605
606
def triop(name, ty, const_expr):
    """Register a three-source opcode whose sources and result all have type
    ``ty``.  All sizes are 0 and no algebraic properties are set.
    """
    opcode(name,
           output_size=0,
           output_type=ty,
           input_sizes=[0, 0, 0],
           input_types=[ty, ty, ty],
           algebraic_properties="",
           const_expr=const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source opcode with explicitly given sizes.  The
    output and all sources have type ``tuint`` and no algebraic properties
    are set.
    """
    sizes = [src1_size, src2_size, src3_size]
    types = [tuint] * 3
    opcode(name, output_size, tuint, sizes, types, "", const_expr)
613
# Fused multiply-add: src0 * src1 + src2.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1, weighted by src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")

# SM5 bfi assembly
# The insert value is shifted left by the number of trailing zero bits in
# the mask, then merged into base under the mask.  A zero mask returns base.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
unsigned tmp = mask;
while (!(tmp & 1)) {
tmp >>= 1;
insert <<= 1;
}
dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
# Sources are (base, offset, bits).  A field that reaches bit 32 or beyond
# falls back to a plain right shift by offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
# Sources are (base, offset, bits); the mask is built in 64 bits so that
# bits == 32 does not overflow the shift.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed GLSL bitfieldExtract(): sources are (base, offset, bits).
#
# The field is first shifted left so that its top bit lands in bit 31, then
# shifted right by (32 - bits) so that it ends up in the low bits with its
# top bit replicated upwards.  Shifting right by `offset` instead would leave
# the field at the wrong position whenever offset != 32 - bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
699
# Combines the first component of each input to make a 3-component vector.
# Each of the three sources is registered with size 1.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
707
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source opcode with explicitly given sizes.  The
    output and all sources have type ``tuint`` and no algebraic properties
    are set.
    """
    sizes = [src1_size, src2_size, src3_size, src4_size]
    types = [tuint] * 4
    opcode(name, output_size, tuint, sizes, types, "", const_expr)
714
# GLSL bitfieldInsert(): sources are (base, insert, offset, bits).
#
# The low `bits` bits of insert replace bits [offset, offset + bits) of base.
# - With bits == 0 nothing is inserted, so the result is base unchanged.
# - The insert value has to be shifted by `offset` (not by `bits`) to line
#   up with the mask.
# - The mask is built in 64 bits so that bits == 32 does not overflow.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
728
# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
735
736