nir: Shift count for shift opcodes is always 32-bits
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #! /usr/bin/env python
2 #
3 # Copyright (C) 2014 Connor Abbott
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining a
6 # copy of this software and associated documentation files (the "Software"),
7 # to deal in the Software without restriction, including without limitation
8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 # and/or sell copies of the Software, and to permit persons to whom the
10 # Software is furnished to do so, subject to the following conditions:
11 #
12 # The above copyright notice and this permission notice (including the next
13 # paragraph) shall be included in all copies or substantial portions of the
14 # Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 # IN THE SOFTWARE.
23 #
24 # Authors:
25 # Connor Abbott (cwabbott0@gmail.com)
26
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, algebraic_properties, const_expr):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - all types are strings that get nir_type_ prepended to them
      - input_types is a list of types
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.

      All arguments are validated with assertions: the sizes must lie in
      0..4, input_sizes and input_types must be non-empty lists of equal
      length, and when output_size is non-zero no input size may be zero.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      # Reject an empty list explicitly rather than via an IndexError.
      assert input_sizes
      # Check every entry, not just the first one, so that a bad element
      # later in the list is caught here instead of in the generated C.
      assert all(isinstance(size, int) for size in input_sizes)
      assert isinstance(input_types, list)
      assert all(isinstance(ty, str) for ty in input_types)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 4
      for size in input_sizes:
         assert 0 <= size <= 4
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
89
# Shorthand names for the type strings passed to the opcode helpers.
tfloat, tint, tuint = "float", "int", "uint"
tbool = "bool32"
tfloat32, tint32, tuint32 = "float32", "int32", "uint32"
tint64, tuint64, tfloat64 = "int64", "uint64", "float64"

# Algebraic property names.  Each ends in a space so that several of them
# can be combined with "+" into one space-separated string.
commutative, associative = "commutative ", "associative "

# Global registry of opcodes, keyed by opcode name.
opcodes = dict()
107
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
   """Build an Opcode from the arguments and store it in `opcodes`.

   The name must not have been registered before; this is asserted.
   """
   assert name not in opcodes
   info = Opcode(name=name,
                 output_size=output_size,
                 output_type=output_type,
                 input_sizes=input_sizes,
                 input_types=input_types,
                 algebraic_properties=algebraic_properties,
                 const_expr=const_expr)
   opcodes[name] = info
113
def unop_convert(name, out_type, in_type, const_expr):
   """Register a one-input opcode whose output type may differ from its
   input type.  Output size and input size are both 0 and no algebraic
   properties are set.
   """
   opcode(name, output_size=0, output_type=out_type,
          input_sizes=[0], input_types=[in_type],
          algebraic_properties="", const_expr=const_expr)
116
def unop(name, ty, const_expr):
   """Register a one-input opcode whose input and output share type `ty`."""
   unop_convert(name, ty, ty, const_expr)
119
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
   """Register a one-input opcode with explicitly given output and input
   sizes and no algebraic properties.
   """
   opcode(name, output_size=output_size, output_type=output_type,
          input_sizes=[input_size], input_types=[input_type],
          algebraic_properties="", const_expr=const_expr)
124
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
   """Register the 2-, 3- and 4-input-component variants of a reduction.

   Three opcodes named name + "2", name + "3" and name + "4" are created
   through unop_horiz().  Their constant expressions are assembled from
   three format templates:

   - prereduce_expr is applied to each source component via {src} and the
     result is parenthesized
   - reduce_expr combines two partial results via {src0} and {src1}
   - final_expr wraps the parenthesized overall result via {src}
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def finish(expr):
      return final_expr.format(src="(" + expr + ")")

   x, y, z, w = ["(" + prereduce_expr.format(src=comp) + ")"
                 for comp in ("src0.x", "src0.y", "src0.z", "src0.w")]

   # The two-component expression is the common prefix of all variants.
   xy = combine(x, y)

   unop_horiz(name + "2", output_size, output_type, 2, input_type,
              finish(xy))
   unop_horiz(name + "3", output_size, output_type, 3, input_type,
              finish(combine(xy, z)))
   unop_horiz(name + "4", output_size, output_type, 4, input_type,
              finish(combine(xy, combine(z, w))))
143
144
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Arithmetic negation and bitwise complement.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# Float logical not: 1.0 when src0 compares equal to zero, otherwise 0.0.
# The expression has separate arms for bit_size == 64 and everything else.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# Sign of src0 as 0, 1 or -1.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
# Absolute value.
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)")
# Saturate: values above 1 become 1, values at or below 0 become 0.
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# Base-2 exponential and logarithm.
# NOTE(review): unlike the ops above, these call the single-precision
# exp2f/log2f without checking bit_size -- confirm this is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
# Type conversions.  Most constant expressions are just "src0": per the
# Opcode docstring, input and output variables already have the correct type,
# so the conversion itself is left to the assignment to dst.
unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
unop_convert("i2i32", tint32, tint, "src0")    # General int (int8_t, int64_t, etc.) to int32_t conversion
unop_convert("u2i32", tint32, tuint, "src0")   # General uint (uint8_t, uint64_t, etc.) to int32_t conversion
unop_convert("i2u32", tuint32, tint, "src0")   # General int (int8_t, int64_t, etc.) to uint32_t conversion
unop_convert("u2u32", tuint32, tuint, "src0")  # General uint (uint8_t, uint64_t, etc.) to uint32_t conversion
unop_convert("i2i64", tint64, tint, "src0")    # General int (int8_t, int32_t, etc.) to int64_t conversion
unop_convert("u2i64", tint64, tuint, "src0")   # General uint (uint8_t, uint64_t, etc.) to int64_t conversion
unop_convert("f2i64", tint64, tfloat, "src0")  # General float (float or double) to int64_t conversion
unop_convert("i2u64", tuint64, tint, "src0")   # General int (int8_t, int64_t, etc.) to uint64_t conversion
unop_convert("u2u64", tuint64, tuint, "src0")  # General uint (uint8_t, uint32_t, etc.) to uint64_t conversion
unop_convert("f2u64", tuint64, tfloat, "src0") # General float (float or double) to uint64_t conversion
unop_convert("i642f", tfloat32, tint64, "src0") # int64_t-to-float conversion.
unop_convert("i642d", tfloat64, tint64, "src0") # int64_t-to-double conversion.
unop_convert("u642f", tfloat32, tuint64, "src0") # uint64_t-to-float conversion.
unop_convert("u642d", tfloat64, tuint64, "src0") # uint64_t-to-double conversion.

# Float-to-boolean conversion
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
# Boolean-to-float conversion
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tbool, tint, "src0 != 0")
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("b2i64", tint64, tbool, "src0 ? 1 : 0") # Boolean-to-int64_t conversion.
unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
# Conversions between single and double precision
unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision
204
# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# Fractional part, computed as src0 minus its floor.
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values whose magnitude is below 2^-14 become a zero carrying the sign of
# src0; everything else is converted to half precision and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.


# All six derivative opcodes constant-fold to 0.0.
unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
232
233
234 # Floating point pack and unpack operations.
235
def pack_2x16(fmt):
   """Register the opcode "pack_<fmt>_2x16".

   It takes a 2-component float32 source and produces one uint32: the
   result of pack_<fmt>_1x16 on .x goes in the low 16 bits and on .y in the
   high 16 bits.  Every occurrence of the text "fmt" in the C template is
   replaced by the `fmt` argument.
   """
   unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))
241
def pack_4x8(fmt):
   """Register the opcode "pack_<fmt>_4x8".

   It takes a 4-component float32 source and produces one uint32 holding
   the pack_<fmt>_1x8 results of .x, .y, .z and .w at bit offsets 0, 8, 16
   and 24.  Every occurrence of the text "fmt" in the C template is
   replaced by the `fmt` argument.
   """
   unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))
249
def unpack_2x16(fmt):
   """Register the opcode "unpack_<fmt>_2x16".

   It takes one uint32 and produces a 2-component float32 result: dst.x is
   unpack_<fmt>_1x16 of the low 16 bits and dst.y is unpack_<fmt>_1x16 of
   the high 16 bits.  Every occurrence of the text "fmt" in the C template
   is replaced by the `fmt` argument.
   """
   # The high half must be shifted *down* before the uint16_t cast; shifting
   # up leaves zeros in the low 16 bits, so the cast would always yield 0.
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
255
def unpack_4x8(fmt):
   """Register the opcode "unpack_<fmt>_4x8".

   It takes one uint32 and produces a 4-component float32 result, applying
   unpack_<fmt>_1x8 to the bytes at bit offsets 0, 8, 16 and 24 for .x, .y,
   .z and .w.  Every occurrence of the text "fmt" in the C template is
   replaced by the `fmt` argument.
   """
   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))
263
264
# Instantiate the pack opcodes: snorm and unorm exist in both 2x16 and 4x8
# flavours, half only as 2x16.
for norm_fmt in ("snorm", "unorm"):
   pack_2x16(norm_fmt)
   pack_4x8(norm_fmt)
pack_2x16("half")

# Same set of formats for the unpack opcodes.
for norm_fmt in ("snorm", "unorm"):
   unpack_2x16(norm_fmt)
   unpack_4x8(norm_fmt)
unpack_2x16("half")
275
# Packs the low 16 bits of .x with .y shifted into the high 16 bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# ORs the four components together at bit offsets 0, 8, 16 and 24.  The
# components are not masked to 8 bits first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
        (src0.y << 8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Builds a 64-bit value with .x as the low and .y as the high 32 bits.
unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")
289
# Builds a 64-bit integer with .x as the low and .y as the high 32 bits.
# The sources are signed 32-bit values, so the low word is cast to uint32_t
# first: without the cast a negative .x would be sign-extended to 64 bits
# and its set upper bits would overwrite the high word supplied by .y.
unop_horiz("pack_int_2x32", 1, tint64, 2, tint32,
           "dst.x = (uint32_t)src0.x | ((int64_t)src0.y << 32);")
292
# Split a 64-bit value into its low (.x) and high (.y) 32-bit halves.
unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_int_2x32", 2, tint32, 1, tint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.


# Each of these unpacks a single half: _x from the low 16 bits and _y from
# the high 16 bits of the source.
unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

# Low (_x) and high (_y) 32 bits of a 64-bit source.
unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")
unop_convert("unpack_int_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_int_2x32_split_y", tuint32, tuint64, "src0 >> 32")
311
# Bit operations, part of ARB_gpu_shader5.


# Reverses the 32 bits of src0: bit i of the source becomes bit 31 - i.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts how many of the 32 bits of src0 are set.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")
328
# Index of the most significant set bit of an unsigned value, or -1 when no
# bit is set.  The scan has to include bit 0 (bit >= 0): stopping at bit 1
# would report -1 for a source of 1.
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
338
# Signed variant: scanning down from bit 31, finds the first 1 bit of a
# non-negative source or the first 0 bit of a negative one.  The result is
# -1 when no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
362
363
# Register fnoise<i>_<j> for every output size i and input size j from 1 to
# 4.  All of them constant-fold to 0.0f.  range() is used rather than
# xrange() so the script runs under both Python 2 and Python 3.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
367
def binop_convert(name, out_type, in_type, alg_props, const_expr):
   """Register a two-input opcode whose inputs share `in_type` and whose
   output type is `out_type`.  The output size and both input sizes are 0.
   """
   opcode(name, output_size=0, output_type=out_type,
          input_sizes=[0, 0], input_types=[in_type, in_type],
          algebraic_properties=alg_props, const_expr=const_expr)
370
def binop(name, ty, alg_props, const_expr):
   """Register a two-input opcode whose inputs and output all have type
   `ty`.
   """
   binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
373
def binop_compare(name, ty, alg_props, const_expr):
   """Register a two-input opcode that takes two `ty` values and produces
   a tbool result.
   """
   binop_convert(name, out_type=tbool, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
376
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
   """Register a two-input opcode with explicitly given sizes and types for
   the output and for each input, and no algebraic properties.
   """
   sizes = [src1_size, src2_size]
   types = [src1_type, src2_type]
   opcode(name, out_size, out_type, sizes, types, "", const_expr)
381
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
   """Register the 2-, 3- and 4-component variants of a two-source
   reduction.

   Three commutative opcodes named name + "2", name + "3" and name + "4"
   are created.  Their constant expressions are assembled from three format
   templates:

   - prereduce_expr is applied to each pair of matching source components
     via {src0} and {src1} and the result is parenthesized
   - reduce_expr combines two partial results via {src0} and {src1}
   - final_expr wraps the parenthesized overall result via {src}
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def finish(expr):
      return final_expr.format(src="(" + expr + ")")

   x, y, z, w = ["(" + prereduce_expr.format(src0="src0." + comp,
                                             src1="src1." + comp) + ")"
                 for comp in ("x", "y", "z", "w")]

   # The two-component expression is the common prefix of all variants.
   xy = combine(x, y)
   pair = [src_type, src_type]

   opcode(name + "2", output_size, output_type,
          [2, 2], list(pair), commutative, finish(xy))
   opcode(name + "3", output_size, output_type,
          [3, 3], list(pair), commutative, finish(combine(xy, z)))
   opcode(name + "4", output_size, output_type,
          [4, 4], list(pair), commutative,
          finish(combine(xy, combine(z, w))))
403
# Basic arithmetic.  Addition and multiplication are flagged commutative and
# associative; subtraction and division carry no algebraic properties.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# NOTE(review): unlike umod/irem/imod below, idiv and udiv do not guard
# against src1 == 0 -- confirm constant folding never sees a zero divisor.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
binop("udiv", tuint, "", "src0 / src1")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
# (The opcode's declared output type is uint, not bool.)

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
# (The opcode's declared output type is uint, not bool.)

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# Unsigned remainder; a zero divisor folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# operands differ in sign.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# Float counterparts: fmod rounds the quotient with floorf, frem with truncf.
# NOTE(review): both use the single-precision functions regardless of
# bit_size -- confirm for 64-bit operands.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
449
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# "ball" variants AND the per-component equality tests together; "bany"
# variants OR the per-component inequality tests together.

binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal


# Shifts.  The value being shifted uses the unsized int/uint type, while the
# shift count (src1) is always declared as uint32.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")
498
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: the per-component products are summed.  fdot produces one
# component; fdot_replicated is registered with an output size of 4.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Dot product of the 3-component src0 with the xyz of src1, plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.
# NOTE(review): fmin/fmax call the single-precision fminf/fmaxf regardless
# of bit_size and, unlike the integer versions, carry no algebraic
# properties -- confirm both are intended.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
539
# The *_4x8 opcodes below treat each 32-bit source as four 8-bit lanes and
# loop over the lanes at bit offsets 0, 8, 16 and 24.

# Saturated vector add for 4 8bit ints.
# Each lane sum is clamped to 0xff with MIN2.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A lane stays 0 unless src0's lane is strictly greater than src1's.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
584
# src0 raised to the power src1.  As with the other floating-point opcodes,
# the double-precision libm function is used when bit_size == 64 and the
# single-precision one otherwise.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
586
# Packs two floats as halves into one uint32: src0.x supplies the low 16
# bits and src1.x the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Build a 64-bit value with src0 as the low and src1 as the high 32 bits.
binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_int_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")
595
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# For in-range arguments the result is a mask of `bits` ones starting at bit
# `offset`.  Out-of-range arguments fold to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# Scales src0 by 2^src1.  The exponent is always int32.
# NOTE(review): isnormal() is also false for zero, infinities and NaN, so
# those results are replaced by a signed zero as well -- confirm intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
613
# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# src1 selects which byte of src0 to take.  The cast to uint8_t/int8_t gives
# the unsigned or signed value of that byte.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# Same as above for 16-bit words.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
628
629
def triop(name, ty, const_expr):
   """Register a three-input opcode whose inputs and output all have type
   `ty`, with all sizes 0 and no algebraic properties.
   """
   sizes = [0] * 3
   types = [ty] * 3
   opcode(name, 0, ty, sizes, types, "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Register a three-input opcode on tuint values with explicitly given
   output and input sizes and no algebraic properties.
   """
   sizes = [src1_size, src2_size, src3_size]
   types = [tuint] * 3
   opcode(name, output_size, tuint, sizes, types, "", const_expr)
636
# Fused multiply-add; constant-folded as a plain multiply followed by an add.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool, tuint, tuint], "", "src0 ? src1 : src2")
651
# SM5 bfi assembly
#
# Inserts `insert` into `base` under `mask`.  `insert` is first shifted left
# by the number of trailing zero bits in `mask`.  A zero mask returns `base`
# unchanged, which also keeps the while loop from running forever.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")
666
# SM5 ubfe/ibfe assembly
#
# Extract `bits` bits of `base` starting at bit `offset`.  When the field
# fits below bit 32, it is shifted up to the top of the word and back down
# by (32 - bits).  `base` is unsigned in ubfe and signed in ibfe.  When
# offset + bits reaches 32 or more, the result is simply base >> offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
696
# GLSL bitfieldExtract()
#
# Unsigned variant: shifts the field down to bit 0 and masks it to `bits`
# bits.  The mask is built from 1ull so that bits == 32 still gives a full
# 32-bit mask.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed variant of GLSL bitfieldExtract(): the extracted field is
# sign-extended.  The field is first shifted so that its top bit lands in
# bit 31, then shifted back down by (32 - bits) with a sign-extending shift,
# which leaves the field in the low `bits` bits.  Shifting down by `offset`
# instead would leave the field at the wrong position.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
722
# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
730
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-input opcode on tuint values with explicitly given
   output and input sizes and no algebraic properties.
   """
   sizes = [src1_size, src2_size, src3_size, src4_size]
   types = [tuint] * 4
   opcode(name, output_size, tuint, sizes, types, "", const_expr)
737
# GLSL bitfieldInsert(): replaces bits [offset, offset + bits - 1] of `base`
# with the low `bits` bits of `insert` and keeps every other bit of `base`.
#
# - With bits == 0 nothing is replaced, so the result is `base`.
# - `insert` has to be moved up by `offset` so that its low bits line up
#   with the mask.
# - Out-of-range arguments fold to 0.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
751
# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
758
759