nir: add opcode to perform int64 to bool conversions
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #! /usr/bin/env python
2 #
3 # Copyright (C) 2014 Connor Abbott
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining a
6 # copy of this software and associated documentation files (the "Software"),
7 # to deal in the Software without restriction, including without limitation
8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 # and/or sell copies of the Software, and to permit persons to whom the
10 # Software is furnished to do so, subject to the following conditions:
11 #
12 # The above copyright notice and this permission notice (including the next
13 # paragraph) shall be included in all copies or substantial portions of the
14 # Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 # IN THE SOFTWARE.
23 #
24 # Authors:
25 # Connor Abbott (cwabbott0@gmail.com)
26
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Description of a single NIR ALU opcode.

    NOTE: this must be kept in sync with nir_op_info.
    """

    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, algebraic_properties, const_expr):
        """Validate the arguments and store them on the instance.

        Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - output_size and every entry of input_sizes is an int in 0..4; when
          output_size is non-zero, no input size may be zero
        - all types are strings that get nir_type_ prepended to them
        - input_sizes and input_types are non-empty lists of equal length
        - algebraic_properties is a space-separated string, where nir_op_is_
          is prepended before each entry
        - const_expr is an expression or series of statements that computes
          the constant value of the opcode given the constant values of its
          inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        # Type checks run in the same order as the parameters are listed;
        # the first element of each list stands in for the whole list.
        for value, kind in ((name, str), (output_size, int),
                            (output_type, str), (input_sizes, list)):
            assert isinstance(value, kind)
        assert isinstance(input_sizes[0], int)
        assert isinstance(input_types, list)
        assert isinstance(input_types[0], str)
        for value in (algebraic_properties, const_expr):
            assert isinstance(value, str)

        # Range checks on the component counts.
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        assert all(0 <= size <= 4 and (output_size == 0 or size != 0)
                   for size in input_sizes)

        self.name = name
        self.output_size = output_size
        self.output_type = output_type
        self.num_inputs = len(input_sizes)
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
89
# Shorthand for the type strings passed to the opcode helpers below.
# Types without a bit size:
tfloat = "float"
tint = "int"
tuint = "uint"
# Types with an explicit bit size:
tbool = "bool32"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tfloat64 = "float64"
tint64 = "int64"
tuint64 = "uint64"

# Algebraic property names.  Each ends in a space so that several can be
# joined with "+" into one space-separated string.
commutative = "commutative "
associative = "associative "
104
# Every opcode registered by this file, keyed by opcode name.
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode from the arguments and store it in `opcodes`.

    The arguments are forwarded unchanged to the Opcode constructor.
    Registering the same name twice trips an assertion.
    """
    assert name not in opcodes
    new_op = Opcode(name, output_size, output_type, input_sizes,
                    input_types, algebraic_properties, const_expr)
    opcodes[name] = new_op
113
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-input opcode (sizes 0) whose input and output types
    may differ; no algebraic properties are set."""
    opcode(name, 0, out_type, input_sizes=[0], input_types=[in_type],
           algebraic_properties="", const_expr=const_expr)
116
def unop(name, ty, const_expr):
    """Register a one-input opcode (sizes 0) whose input and output both
    have type `ty`; no algebraic properties are set."""
    opcode(name, 0, ty, input_sizes=[0], input_types=[ty],
           algebraic_properties="", const_expr=const_expr)
119
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-input opcode with explicitly given output and input
    sizes and types; no algebraic properties are set."""
    sizes = [input_size]
    types = [input_type]
    opcode(name, output_size, output_type, sizes, types, "", const_expr)
124
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register name2, name3 and name4: reductions over the components of a
    2-, 3- and 4-component input.

    - prereduce_expr is applied to each component; it is formatted with
      {src} set to "src0.x", "src0.y", ...
    - reduce_expr combines two partial results; it is formatted with {src0}
      and {src1}
    - final_expr is applied to the fully reduced (parenthesised) value; it
      is formatted with {src}

    The three-component form reduces as ((x, y), z) and the four-component
    form as ((x, y), (z, w)).
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    x, y, z, w = ["(" + prereduce_expr.format(src="src0." + comp) + ")"
                  for comp in "xyzw"]
    variants = (
        (2, combine(x, y)),
        (3, combine(combine(x, y), z)),
        (4, combine(combine(x, y), combine(z, w))),
    )
    for width, reduced in variants:
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, final_expr.format(src="(" + reduced + ")"))
143
144
# Basic same-type unary operations, registered in table order.
#
# fmov and imov differ in which modifiers they support and in what the
# negate modifier means.  Otherwise, they are identical.
for _op, _ty, _expr in [
    ("fmov", tfloat, "src0"),
    ("imov", tint, "src0"),
    ("ineg", tint, "-src0"),
    ("fneg", tfloat, "-src0"),
    ("inot", tint, "~src0"),  # invert every bit of the integer
    ("fnot", tfloat,
     "bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : "
     "((src0 == 0.0f) ? 1.0f : 0.0f)"),
    ("fsign", tfloat,
     "bit_size == 64 ? "
     "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
     "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"),
    ("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)"),
    ("iabs", tint, "(src0 < 0) ? -src0 : src0"),
    ("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)"),
    ("fsat", tfloat,
     "bit_size == 64 ? "
     "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : "
     "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"),
    ("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0"),
    ("frsq", tfloat,
     "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)"),
    ("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)"),
    ("fexp2", tfloat, "exp2f(src0)"),
    ("flog2", tfloat, "log2f(src0)"),
]:
    unop(_op, _ty, _expr)
# Numeric conversions, registered in table order.  Each row is
# (opcode name, destination type, source type, constant expression).
for _op, _dst_ty, _src_ty, _expr in [
    ("f2i", tint32, tfloat32, "src0"),    # Float-to-integer conversion.
    ("f2u", tuint32, tfloat32, "src0"),   # Float-to-unsigned conversion.
    ("d2i", tint32, tfloat64, "src0"),    # Double-to-integer conversion.
    ("d2u", tuint32, tfloat64, "src0"),   # Double-to-unsigned conversion.
    ("i2f", tfloat32, tint32, "src0"),    # Integer-to-float conversion.
    ("i2d", tfloat64, tint32, "src0"),    # Integer-to-double conversion.
    ("i2i32", tint32, tint, "src0"),      # General int to int32_t.
    ("u2i32", tint32, tuint, "src0"),     # General uint to int32_t.
    ("i2u32", tuint32, tint, "src0"),     # General int to uint32_t.
    ("u2u32", tuint32, tuint, "src0"),    # General uint to uint32_t.
    ("i2i64", tint64, tint, "src0"),      # General int to int64_t.
    ("u2i64", tint64, tuint, "src0"),     # General uint to int64_t.
    ("f2i64", tint64, tfloat, "src0"),    # Float or double to int64_t.
    ("i2u64", tuint64, tint, "src0"),     # General int to uint64_t.
    ("u2u64", tuint64, tuint, "src0"),    # General uint to uint64_t.
    ("f2u64", tuint64, tfloat, "src0"),   # Float or double to uint64_t.
    ("i642f", tfloat32, tint64, "src0"),  # int64_t-to-float conversion.
    # int64_t-to-bool conversion.  The comparison is written out, as in
    # i2b/f2b/d2b, so the result does not depend on how a 64-bit value is
    # implicitly converted to the destination's C type.
    ("i642b", tbool, tint64, "src0 != 0"),
    ("i642d", tfloat64, tint64, "src0"),  # int64_t-to-double conversion.
    ("u642f", tfloat32, tuint64, "src0"), # uint64_t-to-float conversion.
    ("u642d", tfloat64, tuint64, "src0"), # uint64_t-to-double conversion.
]:
    unop_convert(_op, _dst_ty, _src_ty, _expr)
190
# Boolean conversions, unsigned-to-floating-point conversions and
# float/double width changes, registered in table order.  Each row is
# (opcode name, destination type, source type, constant expression).
for _op, _dst_ty, _src_ty, _expr in [
    ("f2b", tbool, tfloat32, "src0 != 0.0f"),        # float -> boolean
    ("d2b", tbool, tfloat64, "src0 != 0.0"),         # double -> boolean
    ("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f"),  # boolean -> float
    ("i2b", tbool, tint, "src0 != 0"),               # int -> boolean
    ("b2i", tint32, tbool, "src0 ? 1 : 0"),          # boolean -> int
    ("b2i64", tint64, tbool, "src0 ? 1 : 0"),        # boolean -> int64_t
    ("u2f", tfloat32, tuint32, "src0"),              # unsigned -> float
    ("u2d", tfloat64, tuint32, "src0"),              # unsigned -> double
    ("d2f", tfloat32, tfloat64, "src0"),             # double -> single
    ("f2d", tfloat64, tfloat32, "src0"),             # single -> double
]:
    unop_convert(_op, _dst_ty, _src_ty, _expr)
205
# Unary floating-point rounding operations, the half-float quantize
# operation, and the trigonometric operations.
for _op, _expr in [
    ("ftrunc", "bit_size == 64 ? trunc(src0) : truncf(src0)"),
    ("fceil", "bit_size == 64 ? ceil(src0) : ceilf(src0)"),
    ("ffloor", "bit_size == 64 ? floor(src0) : floorf(src0)"),
    ("ffract", "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))"),
    ("fround_even",
     "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)"),
    ("fquantize2f16",
     "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : "
     "_mesa_half_to_float(_mesa_float_to_half(src0))"),
    ("fsin", "bit_size == 64 ? sin(src0) : sinf(src0)"),
    ("fcos", "bit_size == 64 ? cos(src0) : cosf(src0)"),
]:
    unop(_op, tfloat, _expr)

# Partial derivatives.  The derivative of a constant is 0, so every
# variant has the same constant expression.
for _op in ("fddx", "fddy",
            "fddx_fine", "fddy_fine",
            "fddx_coarse", "fddy_coarse"):
    unop(_op, tfloat, "0.0")
233
234
235 # Floating point pack and unpack operations.
236
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components in, one uint32 out.

    Every occurrence of "fmt" in the C template is replaced by `fmt`, so
    the per-component helper called is pack_<fmt>_1x16.  src0.x lands in
    the low 16 bits and src0.y in the high 16 bits.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    unop_horiz("pack_{0}_2x16".format(fmt), 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
242
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components in, one uint32 out.

    Every occurrence of "fmt" in the C template is replaced by `fmt`, so
    the per-component helper called is pack_<fmt>_1x8.  Components x, y, z
    and w land at bit offsets 0, 8, 16 and 24 respectively.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    unop_horiz("pack_{0}_4x8".format(fmt), 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
250
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 in, two float32 components out.

    Every occurrence of "fmt" in the C template is replaced by `fmt`, so
    the per-component helper called is unpack_<fmt>_1x16.  dst.x is decoded
    from the low 16 bits of src0.x and dst.y from the high 16 bits, which
    mirrors the layout written by pack_2x16.
    """
    # The high half must be shifted *down* before the uint16_t cast; a left
    # shift by 16 always leaves zero in the low 16 bits.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
256
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 in, four float32 components out.

    Every occurrence of "fmt" in the C template is replaced by `fmt`, so
    the per-component helper called is unpack_<fmt>_1x8.  Components x, y,
    z and w are decoded from the bytes at bit offsets 0, 8, 16 and 24.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    unop_horiz("unpack_{0}_4x8".format(fmt), 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
264
265
# Register the pack opcodes, then the unpack opcodes.  The snorm and unorm
# formats have both 2x16 and 4x8 variants; half only has 2x16.
for _fmt in ("snorm", "unorm"):
    pack_2x16(_fmt)
    pack_4x8(_fmt)
pack_2x16("half")

for _fmt in ("snorm", "unorm"):
    unpack_2x16(_fmt)
    unpack_4x8(_fmt)
unpack_2x16("half")
276
# Pack the low 16 bits of x and of y into one 32-bit value (x in the low
# half, y in the high half).
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# Combine four components into one 32-bit value at bit offsets 0/8/16/24.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Build a 64-bit value from two 32-bit halves: x is the low word, y the
# high word.
unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

# Same layout for signed sources.  The low word is cast to uint32_t first:
# without the cast a negative src0.x is sign-extended to 64 bits and its
# high 32 one-bits overwrite the word contributed by src0.y.  The high word
# is shifted as an unsigned value so no signed shift is involved.
unop_horiz("pack_int_2x32", 1, tint64, 2, tint32,
           "dst.x = (uint32_t)src0.x | ((uint64_t)src0.y << 32);")

# Split a 64-bit value into its low (x) and high (y) 32-bit words.
unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_int_2x32", 2, tint32, 1, tint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
299
# Lowered floating point unpacking operations: each opcode extracts a
# single half (x = low bits, y = high bits) of the packed source.
for _half, _expr in [
    ("x", "unpack_half_1x16((uint16_t)(src0.x & 0xffff))"),
    ("y", "unpack_half_1x16((uint16_t)(src0.x >> 16))"),
]:
    unop_horiz("unpack_half_2x16_split_" + _half, 1, tfloat32, 1, tuint32,
               _expr)

# 64-bit to 32-bit splits: _x yields the low word, _y the high word.
for _kind in ("double", "int"):
    for _half, _expr in (("x", "src0"), ("y", "src0 >> 32")):
        unop_convert("unpack_{0}_2x32_split_{1}".format(_kind, _half),
                     tuint32, tuint64, _expr)
312
# Bit operations, part of ARB_gpu_shader5.

# Mirror the 32 bits of the source: bit N moves to bit 31 - N.
_bitfield_reverse_expr = """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
"""
unop("bitfield_reverse", tuint32, _bitfield_reverse_expr)

# Count how many of the 32 source bits are set.
_bit_count_expr = """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
"""
unop("bit_count", tuint32, _bit_count_expr)
329
# Index of the most significant set bit of an unsigned 32-bit value, or -1
# when no bit is set.  The scan has to include bit 0 (bit >= 0); stopping
# at bit > 0 would report -1 for an input of exactly 1.
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
339
# Signed most-significant-bit search: scans from bit 31 down to bit 0 and
# yields the first bit that differs from the sign, or -1 if none does.
_ifind_msb_expr = """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
"""
unop("ifind_msb", tint32, _ifind_msb_expr)

# Index of the least significant set bit, or -1 when no bit is set.
_find_lsb_expr = """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
"""
unop("find_lsb", tint32, _find_lsb_expr)
363
364
# Register fnoise<i>_<j> for every output size i and input size j in 1..4.
# The constant expression is always 0.0f.  range() is used rather than
# xrange() so the script runs under both Python 2 and Python 3.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
368
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-input opcode (sizes 0) whose sources share `in_type`
    and whose result has type `out_type`."""
    opcode(name, 0, out_type, input_sizes=[0, 0],
           input_types=[in_type, in_type],
           algebraic_properties=alg_props, const_expr=const_expr)
371
def binop(name, ty, alg_props, const_expr):
    """Register a two-input opcode whose sources and result all have type
    `ty`."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
374
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-input opcode that takes two `ty` sources and produces
    a boolean (tbool) result."""
    binop_convert(name, out_type=tbool, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
377
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-input opcode with explicitly given sizes and types for
    the output and for each source; no algebraic properties are set."""
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, "", const_expr)
382
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register name2, name3 and name4: reductions over matching components
    of two 2-, 3- and 4-component inputs.  All three are commutative.

    - prereduce_expr is applied to each pair of matching components; it is
      formatted with {src0} and {src1} set to "src0.x"/"src1.x", ...
    - reduce_expr combines two partial results; it is formatted with {src0}
      and {src1}
    - final_expr is applied to the fully reduced (parenthesised) value; it
      is formatted with {src}

    The three-component form reduces as ((x, y), z) and the four-component
    form as ((x, y), (z, w)).
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + comp,
                                    src1="src1." + comp) + ")"
        for comp in "xyzw"
    ]
    variants = (
        (2, combine(x, y)),
        (3, combine(combine(x, y), z)),
        (4, combine(combine(x, y), combine(z, w))),
    )
    for width, reduced in variants:
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], commutative,
               final_expr.format(src="(" + reduced + ")"))
404
# Basic arithmetic, registered in table order.  Each row is
# (opcode name, type, algebraic properties, constant expression).
for _op, _ty, _props, _expr in [
    ("fadd", tfloat, commutative + associative, "src0 + src1"),
    ("iadd", tint, commutative + associative, "src0 + src1"),
    ("fsub", tfloat, "", "src0 - src1"),
    ("isub", tint, "", "src0 - src1"),
    ("fmul", tfloat, commutative + associative, "src0 * src1"),
    # low 32-bits of signed/unsigned integer multiply
    ("imul", tint, commutative + associative, "src0 * src1"),
    # high 32-bits of signed integer multiply
    ("imul_high", tint32, commutative,
     "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)"),
    # high 32-bits of unsigned integer multiply
    ("umul_high", tuint32, commutative,
     "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)"),
    ("fdiv", tfloat, "", "src0 / src1"),
    ("idiv", tint, "", "src0 / src1"),
    ("udiv", tuint, "", "src0 / src1"),
]:
    binop(_op, _ty, _props, _expr)
423
# uadd_carry: the carry produced by adding the two unsigned arguments
# (the wrapped sum is smaller than an operand exactly when a carry occurs).
_uadd_carry_expr = "src0 + src1 < src0"
binop_convert("uadd_carry", tuint, tuint, commutative, _uadd_carry_expr)

# usub_borrow: the borrow produced by subtracting the second unsigned
# argument from the first.
_usub_borrow_expr = "src0 < src1"
binop_convert("usub_borrow", tuint, tuint, "", _usub_borrow_expr)

# Unsigned remainder; a zero divisor yields 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
435
# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V: irem implements the standard C/C++ signed "%" operation, while
# imod implements the more mathematical "modulus" operation.  For details
# on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html
#
# fmod and frem are the floating-point counterparts, built on floorf and
# truncf respectively.  The integer forms yield 0 for a zero divisor.
for _op, _ty, _expr in [
    ("irem", tint, "src1 == 0 ? 0 : src0 % src1"),
    ("imod", tint,
     "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
     " src0 % src1 : src0 % src1 + src1)"),
    ("fmod", tfloat, "src0 - src1 * floorf(src0 / src1)"),
    ("frem", tfloat, "src0 - src1 * truncf(src0 / src1)"),
]:
    binop(_op, _ty, "", _expr)
450
#
# Comparisons
#

# These integer-aware comparisons return a boolean (0 or ~0).  Each row is
# (opcode name, source type, algebraic properties, constant expression).
for _op, _ty, _props, _expr in [
    ("flt", tfloat, "", "src0 < src1"),
    ("fge", tfloat, "", "src0 >= src1"),
    ("feq", tfloat, commutative, "src0 == src1"),
    ("fne", tfloat, commutative, "src0 != src1"),
    ("ilt", tint, "", "src0 < src1"),
    ("ige", tint, "", "src0 >= src1"),
    ("ieq", tint, commutative, "src0 == src1"),
    ("ine", tint, commutative, "src0 != src1"),
    ("ult", tuint, "", "src0 < src1"),
    ("uge", tuint, "", "src0 >= src1"),
]:
    binop_compare(_op, _ty, _props, _expr)
468
# Integer-aware GLSL-style comparisons over whole vectors: "all equal"
# ANDs the per-component results, "any not equal" ORs them.  The result is
# a boolean.
for _op, _src_ty, _cmp, _join in [
    ("ball_fequal", tfloat, "{src0} == {src1}", "{src0} && {src1}"),
    ("bany_fnequal", tfloat, "{src0} != {src1}", "{src0} || {src1}"),
    ("ball_iequal", tint, "{src0} == {src1}", "{src0} && {src1}"),
    ("bany_inequal", tint, "{src0} != {src1}", "{src0} || {src1}"),
]:
    binop_reduce(_op, 1, tbool, _src_ty, _cmp, _join, "{src}")

# Non-integer-aware GLSL-style comparisons that return 0.0 or 1.0.
for _op, _cmp, _join in [
    ("fall_equal", "{src0} == {src1}", "{src0} && {src1}"),
    ("fany_nequal", "{src0} != {src1}", "{src0} || {src1}"),
]:
    binop_reduce(_op, 1, tfloat32, tfloat32, _cmp, _join,
                 "{src} ? 1.0f : 0.0f")
486
# Comparisons for integer-less hardware: the result is 1.0 for true and
# 0.0 for false.
for _op, _props, _expr in [
    ("slt", "", "(src0 < src1) ? 1.0f : 0.0f"),            # Set on Less Than
    ("sge", "", "(src0 >= src1) ? 1.0f : 0.0f"),           # Set on Greater or Equal
    ("seq", commutative, "(src0 == src1) ? 1.0f : 0.0f"),  # Set on Equal
    ("sne", commutative, "(src0 != src1) ? 1.0f : 0.0f"),  # Set on Not Equal
]:
    binop(_op, tfloat32, _props, _expr)
494
495
# Shifts.  The value being shifted has type _ty; the shift count is always
# a 32-bit unsigned integer.
for _op, _ty, _expr in [
    ("ishl", tint, "src0 << src1"),
    ("ishr", tint, "src0 >> src1"),
    ("ushr", tuint, "src0 >> src1"),
]:
    opcode(_op, 0, _ty, [0, 0], [_ty, tuint32], "", _expr)

# Bitwise logic operators.
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.
for _op, _expr in [
    ("iand", "src0 & src1"),
    ("ior", "src0 | src1"),
    ("ixor", "src0 ^ src1"),
]:
    binop(_op, tuint, commutative + associative, _expr)
509
510
# Floating point logic operators.
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false.
for _op, _expr in [
    ("fand", "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f"),
    ("for", "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f"),
    ("fxor",
     "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f)"
     " ? 1.0f : 0.0f"),
]:
    binop(_op, tfloat32, commutative, _expr)

# Dot products: multiply matching components and sum the products.  The
# plain form has a one-component result, the _replicated form a
# four-component one.
for _op, _out_size in (("fdot", 1), ("fdot_replicated", 4)):
    binop_reduce(_op, _out_size, tfloat, tfloat,
                 "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: dot product of a 3-component vector with the xyz of a 4-component
# vector, plus that vector's w.
_fdph_expr = "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w"
for _op, _out_size in (("fdph", 1), ("fdph_replicated", 4)):
    opcode(_op, _out_size, tfloat, [3, 4], [tfloat, tfloat], "", _fdph_expr)
533
# Minimum and maximum.  The integer forms are marked commutative and
# associative; the floating-point forms carry no algebraic properties.
_int_props = commutative + associative
_pick_smaller = "src1 > src0 ? src0 : src1"
_pick_larger = "src1 > src0 ? src1 : src0"

binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, _int_props, _pick_smaller)
binop("umin", tuint, _int_props, _pick_smaller)
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, _int_props, _pick_larger)
binop("umax", tuint, _int_props, _pick_larger)
540
# Operations on four 8-bit channels packed into one 32-bit value.  Every
# expression walks the channels at bit offsets 0, 8, 16 and 24.

# Saturated vector add for 4 8bit ints.
_usadd_4x8_expr = """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
"""
binop("usadd_4x8", tint32, commutative + associative, _usadd_4x8_expr)

# Saturated vector subtract for 4 8bit ints.
_ussub_4x8_expr = """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
"""
binop("ussub_4x8", tint32, "", _ussub_4x8_expr)

# vector min for 4 8bit ints.
_umin_4x8_expr = """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
"""
binop("umin_4x8", tint32, commutative + associative, _umin_4x8_expr)

# vector max for 4 8bit ints.
_umax_4x8_expr = """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
"""
binop("umax_4x8", tint32, commutative + associative, _umax_4x8_expr)

# unorm multiply: (a * b) / 255.
_umul_unorm_4x8_expr = """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
"""
binop("umul_unorm_4x8", tint32, commutative + associative,
      _umul_unorm_4x8_expr)
585
# Power function.  As with the other bit_size-dependent opcodes in this
# file, the double-precision routine (pow) handles 64-bit sources and the
# single-precision routine (powf) handles everything else.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
587
# Pack two floats as half-floats into one 32-bit value: the first source
# fills the low 16 bits, the second the high 16 bits.
_pack_half_split_expr = (
    "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            _pack_half_split_expr)

# Build a 64-bit value from two 32-bit words: the first source is the low
# word, the second the high word.
for _op in ("pack_double_2x32_split", "pack_int_2x32_split"):
    binop_convert(_op, tuint64, tuint32, "",
                  "src0 | ((uint64_t)src1 << 32)")
596
# bfm implements the behavior of the first operation of the SM5 "bfi"
# assembly and that of the "bfi1" i965 instruction.  That is, it has
# undefined behavior if either of its arguments is 32.  The constant
# expression yields 0 for any out-of-range bits/offset and otherwise a mask
# of `bits` one-bits starting at bit `offset`.
_bfm_expr = """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
"""
binop_convert("bfm", tuint32, tint32, "", _bfm_expr)
607
# ldexp: scale a float (src0) by two raised to an int32 exponent (src1).
# Any result that is not a normal number is replaced by a zero carrying
# the sign of src0.
_ldexp_expr = """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
"""
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", _ldexp_expr)
614
# Combines the first component of each input to make a 2-component vector.
_vec2_expr = """
dst.x = src0.x;
dst.y = src1.x;
"""
binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, _vec2_expr)

# Byte and word extraction: src1 selects which byte/word of src0 to
# return.  The cast in each expression picks the width and signedness.
for _op, _ty, _expr in [
    ("extract_u8", tuint, "(uint8_t)(src0 >> (src1 * 8))"),
    ("extract_i8", tint, "(int8_t)(src0 >> (src1 * 8))"),
    ("extract_u16", tuint, "(uint16_t)(src0 >> (src1 * 16))"),
    ("extract_i16", tint, "(int16_t)(src0 >> (src1 * 16))"),
]:
    binop(_op, _ty, "", _expr)
629
630
def triop(name, ty, const_expr):
    """Register a three-input opcode (sizes 0) whose sources and result all
    have type `ty`; no algebraic properties are set."""
    opcode(name, 0, ty, input_sizes=[0, 0, 0], input_types=[ty] * 3,
           algebraic_properties="", const_expr=const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-input opcode with explicitly given sizes; the
    output and all three sources have type tuint."""
    sizes = [src1_size, src2_size, src3_size]
    opcode(name, output_size, tuint, sizes, [tuint] * 3, "", const_expr)
637
# Fused multiply-add and linear interpolation.
for _op, _expr in [
    ("ffma", "src0 * src1 + src2"),
    ("flrp", "src0 * (1 - src2) + src1 * src2"),
]:
    triop(_op, tfloat, _expr)

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors).  There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
_fcsel_expr = "(src0 != 0.0f) ? src1 : src2"
_bcsel_expr = "src0 ? src1 : src2"
triop("fcsel", tfloat32, _fcsel_expr)
opcode("bcsel", 0, tuint, [0, 0, 0], [tbool, tuint, tuint], "", _bcsel_expr)
652
# SM5 bfi assembly: insert `insert` into `base` under `mask`.  `insert` is
# first shifted left by the number of trailing zero bits in the mask; a
# zero mask returns `base` unchanged.
_bfi_expr = """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
"""
triop("bfi", tuint32, _bfi_expr)
667
# SM5 ubfe/ibfe assembly: extract `bits` bits of `base` starting at bit
# `offset`.  The two opcodes differ only in the signedness of `base` (and
# therefore of the right shift).  A zero width or a negative argument
# yields 0.
_bfe_src_types = {
    "ubfe": tuint32,
    "ibfe": tint32,
}

_ubfe_expr = """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
"""
opcode("ubfe", 0, _bfe_src_types["ubfe"],
       [0, 0, 0], [_bfe_src_types["ubfe"], tint32, tint32], "", _ubfe_expr)

_ibfe_expr = """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
"""
opcode("ibfe", 0, _bfe_src_types["ibfe"],
       [0, 0, 0], [_bfe_src_types["ibfe"], tint32, tint32], "", _ibfe_expr)
697
# GLSL bitfieldExtract(), unsigned form: returns `bits` bits of `base`
# starting at bit `offset`, zero-extended.  A zero width yields 0, as does
# any out-of-range combination.
_ubitfield_extract_expr = """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
"""
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", _ubitfield_extract_expr)
# GLSL bitfieldExtract(), signed form: returns `bits` bits of `base`
# starting at bit `offset`, sign-extended.  The left shift moves the top
# bit of the field to bit 31; the arithmetic right shift by (32 - bits)
# then brings the field down to bit 0 while replicating its sign bit.
# Shifting right by `offset` instead would only be correct when
# offset == 32 - bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
723
# Combines the first component of each input to make a 3-component vector.
_vec3_expr = """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
"""
triop_horiz("vec3", 3, 1, 1, 1, _vec3_expr)
731
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-input opcode with explicitly given sizes; the output
    and all four sources have type tuint."""
    sizes = [src1_size, src2_size, src3_size, src4_size]
    opcode(name, output_size, tuint, sizes, [tuint] * 4, "", const_expr)
738
# GLSL bitfieldInsert(): replace bits [offset, offset + bits - 1] of `base`
# with bits [0, bits - 1] of `insert`; every other bit comes from `base`.
#
# - A zero width inserts nothing, so the result is `base` itself.
# - `insert` is moved into position by shifting it left by `offset` (the
#   same amount the mask is shifted by), not by `bits`.
# - Out-of-range arguments yield 0.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
752
# Combines the first component of each input to make a 4-component vector.
_vec4_expr = """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
"""
quadop_horiz("vec4", 4, 1, 1, 1, 1, _vec4_expr)
759
760