nir: fix definition of pack_uvec2_to_uint
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #! /usr/bin/env python
2 #
3 # Copyright (C) 2014 Connor Abbott
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining a
6 # copy of this software and associated documentation files (the "Software"),
7 # to deal in the Software without restriction, including without limitation
8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 # and/or sell copies of the Software, and to permit persons to whom the
10 # Software is furnished to do so, subject to the following conditions:
11 #
12 # The above copyright notice and this permission notice (including the next
13 # paragraph) shall be included in all copies or substantial portions of the
14 # Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 # IN THE SOFTWARE.
23 #
24 # Authors:
25 # Connor Abbott (cwabbott0@gmail.com)
26
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Class that represents all the information we have about the opcode

    NOTE: this must be kept in sync with nir_op_info
    """

    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, algebraic_properties, const_expr):
        """Validate and store the description of one opcode.

        Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - output_size and every entry of input_sizes is an int in 0..4; when
          output_size is non-zero no input size may be 0
        - all types are strings that get nir_type_ prepended to them
        - input_sizes is a non-empty list of ints, one entry per input
        - input_types is a list of types, the same length as input_sizes
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.

        Every check below is an assert, so a malformed definition raises
        AssertionError.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        assert isinstance(input_types, list)
        assert input_sizes, "an opcode needs at least one input"
        assert len(input_sizes) == len(input_types)
        # Check every entry rather than only the first one, so a bad value in
        # a later position is caught here instead of in the generated C.
        assert all(isinstance(size, int) for size in input_sizes)
        assert all(isinstance(ty, str) for ty in input_types)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
89
# Shorthand for the type strings handed to opcode(); nir_type_ gets prepended
# to each of them.
tfloat, tint, tuint = "float", "int", "uint"
tbool = "bool32"
tfloat32, tint32, tuint32 = "float32", "int32", "uint32"
tuint64, tfloat64 = "uint64", "float64"

# Algebraic property names.  The trailing space lets several of them be
# joined with "+" into one space-separated string.
commutative, associative = "commutative ", "associative "

# Global table of every opcode defined in this file, keyed by opcode name.
opcodes = dict()
106
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode from the arguments and register it in ``opcodes``.

    Asserts that no opcode called ``name`` has been registered before.  The
    arguments are passed to Opcode() unchanged.
    """
    assert name not in opcodes
    new_op = Opcode(name, output_size, output_type, input_sizes,
                    input_types, algebraic_properties, const_expr)
    opcodes[name] = new_op
112
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output and source types differ.

    Output size and source size are both 0 and no algebraic properties are
    set.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], algebraic_properties="",
           const_expr=const_expr)
115
def unop(name, ty, const_expr):
    """Register a one-source opcode whose output and source share type ``ty``.

    Output size and source size are both 0 and no algebraic properties are
    set.
    """
    opcode(name, output_size=0, output_type=ty, input_sizes=[0],
           input_types=[ty], algebraic_properties="", const_expr=const_expr)
118
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and source sizes.

    No algebraic properties are set.
    """
    opcode(name, output_size, output_type,
           input_sizes=[input_size], input_types=[input_type],
           algebraic_properties="", const_expr=const_expr)
123
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a reduction opcode.

    - prereduce_expr is formatted with {src} for each source component and
      the result is wrapped in parentheses
    - reduce_expr is formatted with {src0} and {src1} to combine two partial
      results
    - final_expr is formatted with {src} set to the parenthesised reduction

    The variants are registered as name + "2", name + "3" and name + "4".
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [
        "(" + prereduce_expr.format(src="src0." + comp) + ")"
        for comp in "xyzw"
    ]
    xy = combine(x, y)
    # The 4-wide form reduces as (x, y) and (z, w) before the final combine.
    reductions = [(2, xy), (3, combine(xy, z)), (4, combine(xy, combine(z, w)))]
    for width, expr in reductions:
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, finish(expr))
142
143
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Negation and inversion.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 when the source equals zero and 0.0 otherwise.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# Sign functions: 0 for a zero source, otherwise +1 or -1.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)")
# fsat clamps the source to the range [0.0, 1.0].
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): fexp2 and flog2 always call the single-precision C functions,
# unlike the neighbouring opcodes that switch on bit_size -- confirm intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")

# Numeric conversions.  The constant expression is just "src0"; per the
# Opcode documentation the variables already have the declared types.
unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
# Float-to-boolean conversion
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
# Double-to-boolean conversion
unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
# Boolean-to-float conversion
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tbool, tint32, "src0 != 0")
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
# Conversions between single and double precision
unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision
187
# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# ffract is the source minus its floor.
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Sources with a magnitude below 2^-14 become a zero carrying the sign of the
# source; every other value is converted to half precision and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.  All of them constant-fold to 0.0.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
215
216
217 # Floating point pack and unpack operations.
218
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components into one uint32.

    Every occurrence of "fmt" in the C template is replaced by the format
    name, so the template ends up calling pack_<fmt>_1x16().  src0.x fills
    the low 16 bits and src0.y the high 16 bits.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    const_expr = template.replace("fmt", fmt)
    unop_horiz("pack_{0}_2x16".format(fmt), 1, tuint32, 2, tfloat32,
               const_expr)
224
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components into one uint32.

    Every occurrence of "fmt" in the C template is replaced by the format
    name, so the template ends up calling pack_<fmt>_1x8().  Components x, y,
    z and w fill successive bytes starting from the lowest one.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    const_expr = template.replace("fmt", fmt)
    unop_horiz("pack_{0}_4x8".format(fmt), 1, tuint32, 4, tfloat32,
               const_expr)
232
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 into two float32 components.

    Every occurrence of "fmt" in the C template is replaced by the format
    name, so the template ends up calling unpack_<fmt>_1x16().  dst.x comes
    from the low 16 bits of src0.x and dst.y from the high 16 bits.
    """
    # The high half must be shifted DOWN before the uint16_t cast; shifting
    # left by 16 leaves the low 16 bits zero, so dst.y would always decode 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
238
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 into four float32 components.

    Every occurrence of "fmt" in the C template is replaced by the format
    name, so the template ends up calling unpack_<fmt>_1x8().  dst.x, dst.y,
    dst.z and dst.w come from successive bytes of src0.x, lowest byte first.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    const_expr = template.replace("fmt", fmt)
    unop_horiz("unpack_{0}_4x8".format(fmt), 4, tfloat32, 1, tuint32,
               const_expr)
246
247
# Instantiate the pack opcodes: snorm and unorm come in 2x16 and 4x8 forms,
# half only in the 2x16 form.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
# ...and the matching unpack opcodes.
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
258
# Packs two unsigned components into one 32-bit word: the low 16 bits of
# src0.x go into bits 0-15 and src0.y is shifted into bits 16-31.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")
262
# Packs the low byte of each of the four unsigned components into one 32-bit
# word, x in the lowest byte and w in the highest.  Each component is masked
# to 8 bits before shifting so that stray high bits of one component cannot
# spill into the bytes belonging to the others (same approach as the 0xffff
# mask in pack_uvec2_to_uint).
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = ((src0.x & 0xff) <<  0) |
        ((src0.y & 0xff) <<  8) |
        ((src0.z & 0xff) << 16) |
        ((src0.w & 0xff) << 24);
""")
269
# Builds a 64-bit value from two 32-bit words: src0.x is the low word and
# src0.y the high word.
unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

# Splits a 64-bit value into its low word (dst.x) and high word (dst.y).
unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.


# split_x decodes the low 16 bits of the source, split_y the high 16 bits.
unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

# split_x yields the 64-bit source itself as a 32-bit result and split_y the
# source shifted right by 32, both via a uint64 -> uint32 conversion.
unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")
286
287 # Bit operations, part of ARB_gpu_shader5.
288
289
# Reverses the order of the 32 bits: bit N of the source becomes bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts how many of the 32 source bits are set.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")
303
# Index of the most significant set bit of an unsigned value, or -1 when the
# source is 0.  The scan has to include bit 0 (bit >= 0): stopping at bit 1
# would report -1 for src0 == 1 instead of 0.
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
313
# Signed most-significant-bit search.  Scanning down from bit 31, reports the
# first bit that is 1 for a non-negative source or 0 for a negative source;
# the result stays -1 when no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
337
338
# Noise opcodes fnoise<i>_<j> for every combination of output size i and
# source size j in 1..4; each one constant-folds to 0.0f.  range() is used
# instead of xrange() so the script runs under both Python 2 and Python 3.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
342
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share one type.

    Both sources have type ``in_type`` and the output has type ``out_type``;
    the output size and both source sizes are 0.
    """
    src_sizes = [0, 0]
    src_types = [in_type] * 2
    opcode(name, 0, out_type, src_sizes, src_types, alg_props, const_expr)
345
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and output all have ``ty``."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
348
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison: sources of ``ty``, output ``tbool``."""
    binop_convert(name, out_type=tbool, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
351
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types.

    No algebraic properties are set.
    """
    opcode(name, out_size, out_type,
           input_sizes=[src1_size, src2_size],
           input_types=[src1_type, src2_type],
           algebraic_properties="", const_expr=const_expr)
356
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a two-source reduction.

    - prereduce_expr is formatted with {src0} and {src1} for each pair of
      matching source components and the result is wrapped in parentheses
    - reduce_expr is formatted with {src0} and {src1} to combine two partial
      results
    - final_expr is formatted with {src} set to the parenthesised reduction

    The variants are registered as name + "2", name + "3" and name + "4",
    all of them marked commutative.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + comp,
                                    src1="src1." + comp) + ")"
        for comp in "xyzw"
    ]
    xy = combine(x, y)
    # The 4-wide form reduces as (x, y) and (z, w) before the final combine.
    reductions = [(2, xy), (3, combine(xy, z)), (4, combine(xy, combine(z, w)))]
    for width, expr in reductions:
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], commutative,
               finish(expr))
378
# Addition and subtraction.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# Division.  NOTE(review): unlike umod/irem/imod below, idiv and udiv do not
# guard against a zero divisor -- confirm that is acceptable when folding.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
binop("udiv", tuint, "", "src0 / src1")

# yields the carry resulting from the addition of the two unsigned
# arguments.  The output type is tuint and the value is the result of the C
# comparison below.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# yields the borrow resulting from the subtraction of the two unsigned
# arguments.  The output type is tuint and the value is the result of the C
# comparison below.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# Unsigned remainder; a zero divisor yields 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 back onto the C remainder when the remainder is non-zero and
# the operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# NOTE(review): fmod and frem always use the single-precision floorf/truncf,
# regardless of bit_size -- confirm intended for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
424
425 #
426 # Comparisons
427 #
428
429
# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
# ("all components equal" / "any component not equal", in 2-, 3- and
# 4-component variants)

binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
468
469
# Shifts: ishr shifts a signed (tint) value, ushr an unsigned (tuint) one.
binop("ishl", tint, "", "src0 << src1")
binop("ishr", tint, "", "src0 >> src1")
binop("ushr", tuint, "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: component-wise multiply, then sum.  The _replicated variant
# has an output size of 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: dot product of a 3-component src0 with the xyz part of src1, plus
# src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.  NOTE(review): fmin and fmax always use the
# single-precision fminf/fmaxf regardless of bit_size, and are not marked
# commutative unlike their integer counterparts -- confirm intended.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
514
# The opcodes below treat each 32-bit source as four 8-bit channels and
# process one channel per loop iteration (i is the bit offset of the channel).

# Saturated vector add for 4 8bit ints.  Each channel sum is clamped to 0xff.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.  A channel stays 0 unless the
# src0 channel is larger than the src1 channel.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
559
# Power function.  64-bit sources use the double-precision pow() and all
# other sizes the single-precision powf(), matching the bit_size selection
# used by the other floating-point opcodes in this file.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
561
# Packs two floats as half-precision values into one 32-bit word: src0 in
# the low 16 bits, src1 in the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Builds a 64-bit value with src0 as the low word and src1 as the high word.
binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, its behavior is undefined
# if either of its arguments is 32.  The constant expression returns 0 for
# every out-of-range combination; otherwise it builds a mask of `bits` ones
# starting at bit `offset`.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
dst = 0; /* undefined */
else
dst = ((1u << bits) - 1) << offset;
""")

# ldexp computes src0 * 2^src1 via the C library.
# NOTE(review): isnormal() is also false for zero, infinity and NaN, so those
# results are replaced by a signed zero as well, and the replacement uses the
# single-precision copysignf even for 64-bit values -- confirm intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
""")
585
# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction: src1 selects which byte of src0 to take; the cast decides
# between zero extension (u8) and sign extension (i8).
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: same as above for 16-bit words.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
600
601
def triop(name, ty, const_expr):
    """Register a three-source opcode whose sources and output all have ``ty``.

    The output size and all source sizes are 0 and no algebraic properties
    are set.
    """
    opcode(name, 0, ty, [0] * 3, [ty] * 3, "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source tuint opcode with explicit sizes.

    The output and every source have type ``tuint``; no algebraic properties
    are set.
    """
    src_sizes = [src1_size, src2_size, src3_size]
    opcode(name, output_size, tuint, input_sizes=src_sizes,
           input_types=[tuint] * 3, algebraic_properties="",
           const_expr=const_expr)
608
# Fused multiply-add, folded as a plain multiply followed by an add.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")

# SM5 bfi assembly
#
# src0 is the mask, src1 the value to insert and src2 the base.  The insert
# value is shifted left by the number of trailing zero bits in the mask
# before being merged into the base; an all-zero mask returns the base.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
unsigned tmp = mask;
while (!(tmp & 1)) {
tmp >>= 1;
insert <<= 1;
}
dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
#
# src0 is the base, src1 the offset and src2 the number of bits.  Zero bits
# or a negative offset/bits yields 0.  When the field ends below bit 32 it is
# shifted to the top of the word and back down; otherwise the base is simply
# shifted right by the offset.  ubfe works on an unsigned base, ibfe on an
# int base.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
668
# GLSL bitfieldExtract()
#
# Unsigned variant: src0 is the base, src1 the offset and src2 the number of
# bits.  Zero bits yields 0, as does any negative or out-of-range
# offset/bits combination; otherwise the base is shifted right by the offset
# and masked to `bits` bits.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed variant of GLSL bitfieldExtract(): src0 is the base, src1 the offset
# and src2 the number of bits.  Zero bits yields 0, as does any negative or
# out-of-range offset/bits combination.  Otherwise the field is first moved
# so that its top bit sits at bit 31, then shifted back down by (32 - bits)
# so that the field lands at bit 0 with its top bit replicated into the
# upper bits.  (Shifting down by `offset` instead would leave the field at
# the wrong position whenever offset != 32 - bits.)
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
694
# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
702
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source tuint opcode with explicit sizes.

    The output and every source have type ``tuint``; no algebraic properties
    are set.
    """
    src_sizes = [src1_size, src2_size, src3_size, src4_size]
    opcode(name, output_size, tuint, input_sizes=src_sizes,
           input_types=[tuint] * 4, algebraic_properties="",
           const_expr=const_expr)
709
# GLSL bitfieldInsert(): src0 is the base, src1 the value to insert, src2 the
# offset and src3 the number of bits.
#
# - bits == 0 inserts nothing, so the result is the unmodified base
# - a negative or out-of-range offset/bits combination yields 0
# - otherwise the low `bits` bits of insert replace bits
#   [offset, offset + bits) of base; insert therefore has to be shifted by
#   `offset` (not by `bits`) to line up with the mask
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
723
# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
730
731