nir: fix comment typo about f2d/d2f
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #! /usr/bin/env python
2 #
3 # Copyright (C) 2014 Connor Abbott
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining a
6 # copy of this software and associated documentation files (the "Software"),
7 # to deal in the Software without restriction, including without limitation
8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 # and/or sell copies of the Software, and to permit persons to whom the
10 # Software is furnished to do so, subject to the following conditions:
11 #
12 # The above copyright notice and this permission notice (including the next
13 # paragraph) shall be included in all copies or substantial portions of the
14 # Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 # IN THE SOFTWARE.
23 #
24 # Authors:
25 # Connor Abbott (cwabbott0@gmail.com)
26
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Record of everything this file knows about a single opcode.

    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_sizes is a list of ints, one per input, each in 0..4
        - input_types is a list of types, the same length as input_sizes
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.

        Malformed arguments trip an AssertionError.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        # Check every element, not just the first, so that a bad entry later
        # in the list is caught here and an empty list does not raise
        # IndexError.
        assert all(isinstance(size, int) for size in input_sizes)
        assert isinstance(input_types, list)
        assert all(isinstance(ty, str) for ty in input_types)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            # A non-zero output size requires every input size to be
            # non-zero as well.
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
89
# Short aliases for the type-name strings used throughout the opcode table.
# Names without an explicit bit size:
tfloat, tint, tuint = "float", "int", "uint"
tbool = "bool32"
# Names carrying an explicit bit size:
tfloat32, tint32, tuint32 = "float32", "int32", "uint32"
tuint64, tfloat64 = "uint64", "float64"

# Algebraic-property tokens.  Each ends in a space so that several of them
# can be joined with "+" into one space-separated string.
commutative, associative = "commutative ", "associative "

# Every opcode registered by this file, keyed by opcode name.
opcodes = dict()
106
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode from the arguments and register it under `name`.

    A name may only be registered once; a duplicate trips the assertion
    before anything is constructed.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes,
                  input_types, algebraic_properties, const_expr)
    opcodes[name] = info
112
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose source and result types may differ.

    Output size and input size are both passed as 0, and no algebraic
    properties are set.
    """
    sizes, types = [0], [in_type]
    opcode(name, 0, out_type, sizes, types, "", const_expr)
115
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type `ty`."""
    opcode(name, output_size=0, output_type=ty, input_sizes=[0],
           input_types=[ty], algebraic_properties="", const_expr=const_expr)
118
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes.

    No algebraic properties are set.
    """
    no_properties = ""
    opcode(name, output_size, output_type, [input_size], [input_type],
           no_properties, const_expr)
123
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a reducing unary op.

    The opcodes are named name + "2", name + "3" and name + "4".  Their
    constant expressions are assembled from three format templates:

    - prereduce_expr, with a {src} field, is applied to each component of
      src0 and the result is parenthesized
    - reduce_expr, with {src0} and {src1} fields, combines two such results
    - final_expr, with a {src} field, wraps the parenthesized reduction
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    def component(chan):
        return "(" + prereduce_expr.format(src="src0." + chan) + ")"

    x, y, z, w = component("x"), component("y"), component("z"), component("w")
    xy = combine(x, y)

    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               finish(xy))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               finish(combine(xy, z)))
    # The 4-wide form reduces as (x, y) combined with (z, w).
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               finish(combine(xy, combine(z, w))))
142
143
# fmov and imov are otherwise identical moves: they differ only in which
# modifiers they support and in what the negate modifier means.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0")  # invert every bit of the integer
unop("fnot", tfloat, "(src0 == 0.0f) ? 1.0f : 0.0f")
unop("fsign", tfloat,
     "(src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f)")
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabsf(src0)")
unop("fsat", tfloat,
     "(src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0)")
unop("frcp", tfloat, "1.0f / src0")
unop("frsq", tfloat, "1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
# Conversions between float/double and signed/unsigned integers.
unop_convert("f2i", tint32, tfloat32, "src0")  # float -> int
unop_convert("f2u", tuint32, tfloat32, "src0")  # float -> unsigned
unop_convert("d2i", tint32, tfloat64, "src0")  # double -> int
unop_convert("d2u", tuint32, tfloat64, "src0")  # double -> unsigned
unop_convert("i2f", tfloat32, tint32, "src0")  # int -> float
unop_convert("i2d", tfloat64, tint32, "src0")  # int -> double

# Conversions to and from booleans.
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")  # float -> bool
unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")  # double -> bool
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")  # bool -> float
unop_convert("i2b", tbool, tint32, "src0 != 0")  # int -> bool
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0")  # bool -> int

unop_convert("u2f", tfloat32, tuint32, "src0")  # unsigned -> float
unop_convert("u2d", tfloat64, tuint32, "src0")  # unsigned -> double

# Conversions between single and double precision.
unop_convert("d2f", tfloat32, tfloat64, "src0")  # double -> single
unop_convert("f2d", tfloat64, tfloat32, "src0")  # single -> double
182
# Unary floating-point rounding operations.  Each expression selects the
# double-precision C function when bit_size is 64 and the float one otherwise.

unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat,
     "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat,
     "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values with magnitude below 2^-14 become a signed zero; everything else is
# round-tripped through the half-float conversion helpers.
unop("fquantize2f16", tfloat,
     "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : "
     "_mesa_half_to_float(_mesa_float_to_half(src0))")
193
# Trigonometric operations.

unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.  When constant-folding, the input is a constant and
# the derivative of a constant is 0.

unop("fddx", tfloat, "0.0")
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
210
211
212 # Floating point pack and unpack operations.
213
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: a 2-component float32 source to one uint32.

    Every occurrence of "fmt" in the template is replaced textually, so the
    generated expression calls pack_<fmt>_1x16 on src0.x (low 16 bits) and
    src0.y (shifted into the high 16 bits).
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
219
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: a 4-component float32 source to one uint32.

    Every occurrence of "fmt" in the template is replaced textually, so the
    generated expression calls pack_<fmt>_1x8 on each component and places
    x, y, z, w at bit offsets 0, 8, 16 and 24 respectively.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
227
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 source to 2 float32 components.

    Every occurrence of "fmt" in the template is replaced textually, so the
    generated expression calls unpack_<fmt>_1x16.  dst.x is decoded from the
    low 16 bits of src0.x and dst.y from the high 16 bits, mirroring the
    layout written by pack_2x16.
    """
    # The high half must be shifted DOWN before the uint16_t cast; shifting
    # left by 16 leaves the low 16 bits zero, so the cast would always
    # produce 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
233
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 source to 4 float32 components.

    Every occurrence of "fmt" in the template is replaced textually, so the
    generated expression calls unpack_<fmt>_1x8 on the bytes of src0.x at
    bit offsets 0, 8, 16 and 24 to produce x, y, z and w.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
241
242
# Instantiate the pack opcodes: snorm and unorm exist in both 2x16 and 4x8
# form, half only as 2x16.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")

# ... and the matching unpack opcodes.
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
253
# Packs the low 16 bits of each of two uint32 components into one uint32:
# x occupies bits 0-15 and y bits 16-31.  y has to be shifted LEFT into the
# upper half; a right shift would drop its low 16 bits and overlap x.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")
257
# ORs four uint32 components together at bit offsets 0, 8, 16 and 24.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# Two uint32 halves <-> one uint64: x is the low word, y the high word.
unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
270
# Lowered floating point unpacking operations: each opcode extracts a single
# half of the packed source (x = low half, y = high half).

unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")
281
# Bit operations, part of ARB_gpu_shader5.

# Mirrors the 32 bits of the source: bit N of src0 lands in bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")

# Counts how many of the 32 bits of the source are set.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")
298
# Index of the most significant set bit of an unsigned value, or -1 when no
# bit is set.  The scan has to run down to and including bit 0; stopping at
# bit 1 would report -1 for an input of 1.
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
308
# Signed most-significant-bit search, scanning from bit 31 down to bit 0;
# the result is -1 when no bit matches.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
332
333
# Register fnoise<i>_<j> for every output size i and input size j in 1..4.
# The constant-folded value is always 0.0f.
# range() is used instead of xrange() because xrange only exists on
# Python 2; the iteration is identical on both major versions.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
337
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share `in_type`.

    The result type may differ from the source type.  Output size and both
    input sizes are passed as 0.
    """
    sizes = [0, 0]
    types = [in_type, in_type]
    opcode(name, 0, out_type, sizes, types, alg_props, const_expr)
340
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all use `ty`."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
343
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison: sources of type `ty`, tbool result."""
    binop_convert(name, out_type=tbool, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
346
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types per source.

    No algebraic properties are set.
    """
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, "", const_expr)
351
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a reducing binary op.

    The opcodes are named name + "2", name + "3" and name + "4"; all are
    marked commutative and take two sources of type `src_type`.  Their
    constant expressions are assembled from three format templates:

    - prereduce_expr, with {src0} and {src1} fields, is applied to matching
      components of the two sources and the result is parenthesized
    - reduce_expr, with {src0} and {src1} fields, combines two such results
    - final_expr, with a {src} field, wraps the parenthesized reduction
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    def component(chan):
        expr = prereduce_expr.format(src0="src0." + chan, src1="src1." + chan)
        return "(" + expr + ")"

    x, y, z, w = component("x"), component("y"), component("z"), component("w")
    xy = combine(x, y)

    opcode(name + "2", output_size, output_type,
           [2, 2], [src_type, src_type], commutative,
           finish(xy))
    opcode(name + "3", output_size, output_type,
           [3, 3], [src_type, src_type], commutative,
           finish(combine(xy, z)))
    # The 4-wide form reduces as (x, y) combined with (z, w).
    opcode(name + "4", output_size, output_type,
           [4, 4], [src_type, src_type], commutative,
           finish(combine(xy, combine(z, w))))
373
# Addition and subtraction.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

# Multiplication.
binop("fmul", tfloat, commutative + associative, "src0 * src1")
# imul: low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# imul_high: high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# umul_high: high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# Division.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
binop("udiv", tuint, "", "src0 / src1")
392
# uadd_carry: a boolean representing the carry that results from adding the
# two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# usub_borrow: a boolean representing the borrow that results from
# subtracting the second unsigned argument from the first.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
402
# In the integer forms below a zero divisor yields 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# The float variants differ only in how the quotient is rounded
# (floor for fmod, truncation for frem).
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
419
#
# Comparisons
#

# These integer-aware comparisons return a boolean (0 or ~0).

# Floating point
binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
# Signed integer
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
# Unsigned integer
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
437
# Integer-aware GLSL-style comparisons that compare floats and ints.
# "all equal" folds the per-component results with &&, "any not equal"
# folds them with ||.

binop_reduce("ball_fequal", 1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal", 1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# Non-integer-aware GLSL-style comparisons that return 0.0 or 1.0.

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
455
# Comparisons for integer-less hardware: these return 1.0 for true and 0.0
# for false.

# slt: Set on Less Than
binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f")
# sge: Set on Greater or Equal
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f")
# seq: Set on Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f")
# sne: Set on Not Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f")


# Shifts.
binop("ishl", tint, "", "src0 << src1")
binop("ishr", tint, "", "src0 >> src1")
binop("ushr", tuint, "", "src0 >> src1")
468
# Bitwise logic operators.
#
# On hardware supporting integers these double as the boolean and, or
# and xor.

binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# Floating point logic operators.
#
# An input counts as true when (src != 0.0); the output is 1.0 for true
# and 0.0 for false.

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f)"
      " ? 1.0f : 0.0f")
491
# Dot products: multiply matching components, then sum.  The _replicated
# variant declares an output size of 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: three-component dot product of src0 with src1.xyz, plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
509
# The opcodes below treat each 32-bit source as four 8-bit channels and
# process one channel per loop iteration (i = 0, 8, 16, 24).

# usadd_4x8: saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# ussub_4x8: saturated vector subtract for 4 8bit ints.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")

# umin_4x8: vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# umax_4x8: vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# umul_unorm_4x8: unorm multiply, (a * b) / 255 per channel.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
554
# As with the other bit_size-dependent expressions in this file, 64-bit
# operands use the double-precision function (pow) and everything else the
# single-precision one (powf).
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
556
# "Split" pack opcodes take the two halves as separate sources: src0
# supplies the low half and src1 is shifted into the high half.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")
562
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction.  That is, it has undefined behavior
# if either of its arguments are 32.
#
# src0 is the bit count and src1 the offset; out-of-range inputs fold to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
dst = 0; /* undefined */
else
dst = ((1u << bits) - 1) << offset;
""")
573
# ldexp takes a float source and an int32 source, so it is registered through
# opcode() directly rather than through binop().  Results that are not
# normal numbers are replaced by a zero carrying the sign of src0.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
""")
580
# vec2: combines the first component of each input to make a 2-component
# vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction: src1 selects which byte of src0 to return.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: src1 selects which 16-bit word of src0 to return.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
595
596
def triop(name, ty, const_expr):
    """Register a three-source opcode; sources and result all use `ty`.

    All sizes are passed as 0 and no algebraic properties are set.
    """
    sizes = [0, 0, 0]
    types = [ty, ty, ty]
    opcode(name, 0, ty, sizes, types, "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source tuint opcode with explicit sizes.

    The result and all three sources use type tuint; no algebraic
    properties are set.
    """
    sizes = [src1_size, src2_size, src3_size]
    types = [tuint, tuint, tuint]
    opcode(name, output_size, tuint, sizes, types, "", const_expr)
603
# Fused multiply-add and linear interpolation.
triop("ffma", tfloat, "src0 * src1 + src2")

triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors).  There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).

triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
# bcsel's condition is a tbool while its other two sources are tuint, so it
# is registered through opcode() directly.
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")
618
# SM5 bfi assembly.
#
# src0 is the mask, src1 the value to insert and src2 the base.  The insert
# value is shifted left once per trailing zero bit of the mask before being
# merged into the base under the mask; a zero mask returns the base as-is.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
unsigned tmp = mask;
while (!(tmp & 1)) {
tmp >>= 1;
insert <<= 1;
}
dst = (base & ~mask) | (insert & mask);
}
""")
633
# SM5 ubfe/ibfe assembly.
#
# Both take (base, offset, bits).  The two expressions are textually the
# same apart from the declared type of base, which decides whether the
# right shifts are unsigned (ubfe) or signed (ibfe).
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
663
# GLSL bitfieldExtract(), unsigned form.
#
# Takes (base, offset, bits): shifts base right by offset and keeps the low
# `bits` bits.  A zero bit count or an out-of-range field folds to 0.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# GLSL bitfieldExtract(), signed form.
#
# Takes (base, offset, bits).  The field is first shifted left so that its
# top bit sits in bit 31, then shifted right arithmetically by (32 - bits)
# so that it ends up in the low bits with its sign extended.  This is the
# same shift pair that ibfe uses.  Shifting right by `offset` instead would
# only be correct when offset happens to equal 32 - bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
dst = 0;
} else {
dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
689
# vec3: combines the first component of each input to make a 3-component
# vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
697
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source tuint opcode with explicit sizes.

    The result and all four sources use type tuint; no algebraic properties
    are set.
    """
    sizes = [src1_size, src2_size, src3_size, src4_size]
    types = [tuint, tuint, tuint, tuint]
    opcode(name, output_size, tuint, sizes, types, "", const_expr)
704
# GLSL bitfieldInsert().
#
# Takes (base, insert, offset, bits) and replaces bits
# [offset, offset + bits) of base with the low `bits` bits of insert.
# - With bits == 0 nothing is replaced, so the result is base unchanged.
# - The insert value has to be shifted by `offset` to line up with the
#   mask; shifting it by `bits` would place it at the wrong position
#   whenever offset != bits.
# - An out-of-range field folds to 0.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
dst = 0;
} else {
unsigned mask = ((1ull << bits) - 1) << offset;
dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
718
# vec4: combines the first component of each input to make a 4-component
# vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
725
726