nir: Add fclamp_pos opcode
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Container for all the information we have about one opcode.

    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Validate and store the description of an opcode.

        Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - output_size is an int: 0 through 4, 8 or 16
        - input_sizes is a non-empty list of ints, each 0 through 4; when
          output_size is non-zero no input size may be 0
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types, one per entry of input_sizes
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        # Check every entry of both lists, not only the first one, so that a
        # malformed later source is caught here rather than in a generator.
        assert isinstance(input_sizes, list)
        assert input_sizes and all(isinstance(size, int)
                                   for size in input_sizes)
        assert isinstance(input_types, list)
        assert input_types and all(isinstance(type_, str)
                                   for type_ in input_types)
        assert isinstance(is_conversion, bool)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16)
        for size in input_sizes:
            assert 0 <= size <= 4
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
92
93 # helper variables for strings
94 tfloat = "float"
95 tint = "int"
96 tbool = "bool"
97 tbool1 = "bool1"
98 tbool8 = "bool8"
99 tbool16 = "bool16"
100 tbool32 = "bool32"
101 tuint = "uint"
102 tuint16 = "uint16"
103 tfloat16 = "float16"
104 tfloat32 = "float32"
105 tint32 = "int32"
106 tuint32 = "uint32"
107 tint64 = "int64"
108 tuint64 = "uint64"
109 tfloat64 = "float64"
110
111 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
112
def type_has_size(type_):
    """Return True when the NIR type string carries an explicit bit size.

    Asserts that the string is recognised by _TYPE_SPLIT_RE.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
117
def type_size(type_):
    """Return the bit size of a sized NIR type string as an int.

    Asserts that the string is recognised by _TYPE_SPLIT_RE and that it
    actually has a bit size.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
124
def type_sizes(type_):
    """Return the list of bit sizes a NIR type string can take.

    A sized type yields a one-element list holding its own size; an unsized
    type yields the list of sizes available for its base type.
    """
    if type_has_size(type_):
        return [type_size(type_)]

    # Only bool and float have a restricted set; everything else gets the
    # full integer list.
    restricted = {
        'bool': [1, 8, 16, 32],
        'float': [16, 32, 64],
    }
    return restricted.get(type_, [1, 8, 16, 32, 64])
134
def type_base_type(type_):
    """Return the base type (int, uint, float or bool) of a NIR type string.

    Asserts that the string is recognised by _TYPE_SPLIT_RE.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
139
140 # Operation where the first two sources are commutative.
141 #
142 # For 2-source operations, this just mathematical commutativity. Some
143 # 3-source operations, like ffma, are only commutative in the first two
144 # sources.
145 _2src_commutative = "2src_commutative "
146 associative = "associative "
147
148 # global dictionary of opcodes
149 opcodes = {}
150
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Build an Opcode from the arguments and register it under its name.

    Asserts that no opcode with the same name has been registered already.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes,
                  input_types, is_conversion, algebraic_properties,
                  const_expr)
    opcodes[name] = info
157
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode with separately chosen output and input types.

    Output and input sizes are both 0, is_conversion is False and no
    algebraic properties are set.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0], input_types=[in_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
160
def unop(name, ty, const_expr):
    """Register a one-source opcode whose output and input share the type ty.

    Output and input sizes are both 0, is_conversion is False and no
    algebraic properties are set.
    """
    opcode(name=name, output_size=0, output_type=ty,
           input_sizes=[0], input_types=[ty],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
163
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes.

    is_conversion is False and no algebraic properties are set.
    """
    opcode(name=name, output_size=output_size, output_type=output_type,
           input_sizes=[input_size], input_types=[input_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
168
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a reduction opcode.

    prereduce_expr is formatted with {src} for each source component,
    reduce_expr combines two partial results through {src0} and {src1}, and
    final_expr is formatted with {src} set to the parenthesised reduction.
    The variants are named name + "2", name + "3" and name + "4".
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    # Pre-reduced, parenthesised expression for each source component.
    x, y, z, w = [
        "(" + prereduce_expr.format(src="src0." + component) + ")"
        for component in "xyzw"
    ]
    xy = combine(x, y)

    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               finish(xy))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               finish(combine(xy, z)))
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               finish(combine(xy, combine(z, w))))
187
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion.

    Same as unop_convert except that is_conversion is True.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0], input_types=[in_type],
           is_conversion=True, algebraic_properties="",
           const_expr=const_expr)
190
# Plain copy.
unop(name="mov", ty=tuint, const_expr="src0")

# Negation, bitwise not, sign and absolute value.
unop(name="ineg", ty=tint, const_expr="-src0")
unop(name="fneg", ty=tfloat, const_expr="-src0")
unop(name="inot", ty=tint, const_expr="~src0")  # invert every bit of the integer
unop(name="fsign", ty=tfloat,
     const_expr="bit_size == 64 ? "
                "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
                "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))")
unop(name="isign", ty=tint,
     const_expr="(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop(name="iabs", ty=tint, const_expr="(src0 < 0) ? -src0 : src0")
unop(name="fabs", ty=tfloat, const_expr="fabs(src0)")

# Clamps.
unop(name="fsat", ty=tfloat, const_expr="fmin(fmax(src0, 0.0), 1.0)")
unop(name="fsat_signed", ty=tfloat,
     const_expr="fmin(fmax(src0, -1.0), 1.0)")
unop(name="fclamp_pos", ty=tfloat, const_expr="fmax(src0, 0.0)")

# Reciprocal, roots, exponential and logarithm.
unop(name="frcp", ty=tfloat,
     const_expr="bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop(name="frsq", ty=tfloat,
     const_expr="bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop(name="fsqrt", ty=tfloat,
     const_expr="bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop(name="fexp2", ty=tfloat, const_expr="exp2f(src0)")
unop(name="flog2", ty=tfloat, const_expr="log2f(src0)")
210
# Generate all of the numeric conversion opcodes.  Each one is named
# <src letter>2<dst letter><dst bit size>, with a rounding-mode suffix on the
# float -> float16 variants.
for src_t in [tint, tuint, tfloat, tbool]:
    # Destination base types that each source base type converts to.
    dst_types = {
        tbool: [tfloat, tint, tbool],
        tint: [tfloat, tint, tbool],
        tuint: [tfloat, tuint],
        tfloat: [tint, tuint, tfloat, tbool],
    }[src_t]

    for dst_t in dst_types:
        for dst_bit_size in type_sizes(dst_t):
            op_name = "{0}2{1}{2}".format(src_t[0], dst_t[0], dst_bit_size)
            sized_dst_t = dst_t + str(dst_bit_size)
            is_f2f = (dst_t == tfloat and src_t == tfloat)

            if is_f2f and dst_bit_size == 16:
                # float -> float16 comes in three rounding flavours.
                for rnd_mode in ['_rtne', '_rtz', '']:
                    if rnd_mode == '_rtne':
                        conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
} else {
dst = src0;
}
"""
                    elif rnd_mode == '_rtz':
                        conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
} else {
dst = src0;
}
"""
                    else:
                        conv_expr = "src0"

                    unop_numeric_convert(op_name + rnd_mode, sized_dst_t,
                                         src_t, conv_expr)
                continue

            if is_f2f and dst_bit_size == 32:
                conv_expr = """
if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
dst = _mesa_double_to_float_rtz(src0);
} else {
dst = src0;
}
"""
            elif dst_t == tbool:
                conv_expr = "src0 != 0"
            else:
                conv_expr = "src0"

            unop_numeric_convert(op_name, sized_dst_t, src_t, conv_expr)
268
# Special opcode that is the same as f2f16 except that it is safe to remove it
# if the result is immediately converted back to float32 again.  This is
# generated as part of the precision lowering pass.  mp stands for medium
# precision.
unop_numeric_convert(name="f2fmp", out_type=tfloat16, in_type=tfloat,
                     const_expr=opcodes["f2f16"].const_expr)

# Unary floating-point rounding operations.

unop(name="ftrunc", ty=tfloat,
     const_expr="bit_size == 64 ? trunc(src0) : truncf(src0)")
unop(name="fceil", ty=tfloat,
     const_expr="bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop(name="ffloor", ty=tfloat,
     const_expr="bit_size == 64 ? floor(src0) : floorf(src0)")
unop(name="ffract", ty=tfloat,
     const_expr="src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop(name="fround_even", ty=tfloat,
     const_expr="bit_size == 64 ? _mesa_roundeven(src0) : "
                "_mesa_roundevenf(src0)")

unop(name="fquantize2f16", ty=tfloat,
     const_expr="(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : "
                "_mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.

unop(name="fsin", ty=tfloat,
     const_expr="bit_size == 64 ? sin(src0) : sinf(src0)")
unop(name="fcos", ty=tfloat,
     const_expr="bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
unop_convert(name="frexp_exp", out_type=tint32, in_type=tfloat,
             const_expr="frexp(src0, &dst);")
unop_convert(name="frexp_sig", out_type=tfloat, in_type=tfloat,
             const_expr="int n; dst = frexp(src0, &n);")

# Partial derivatives.

unop(name="fddx", ty=tfloat, const_expr="0.0")  # the derivative of a constant is 0.
unop(name="fddy", ty=tfloat, const_expr="0.0")
unop(name="fddx_fine", ty=tfloat, const_expr="0.0")
unop(name="fddy_fine", ty=tfloat, const_expr="0.0")
unop(name="fddx_coarse", ty=tfloat, const_expr="0.0")
unop(name="fddy_coarse", ty=tfloat, const_expr="0.0")
305
306
307 # Floating point pack and unpack operations.
308
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components into one uint32.

    Every occurrence of "fmt" in the expression template is replaced by the
    format name, selecting the pack_<fmt>_1x16 helper.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
314
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components into one uint32.

    Every occurrence of "fmt" in the expression template is replaced by the
    format name, selecting the pack_<fmt>_1x8 helper.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
322
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 into two float32 components.

    dst.x is decoded from the low 16 bits of the source and dst.y from the
    high 16 bits.  Every occurrence of "fmt" in the expression template is
    replaced by the format name, selecting the unpack_<fmt>_1x16 helper.
    """
    # The high half must be shifted DOWN before the narrowing cast; shifting
    # left by 16 and truncating to uint16_t always yields 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
328
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 into four float32 components.

    Each destination component is decoded from one byte of the source, x
    from the lowest byte up to w from the highest.  Every occurrence of
    "fmt" in the expression template is replaced by the format name.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
336
337
# Register the packing opcodes for each format ...
pack_2x16(fmt="snorm")
pack_4x8(fmt="snorm")
pack_2x16(fmt="unorm")
pack_4x8(fmt="unorm")
pack_2x16(fmt="half")
# ... and the matching unpacking opcodes.
unpack_2x16(fmt="snorm")
unpack_4x8(fmt="snorm")
unpack_2x16(fmt="unorm")
unpack_4x8(fmt="unorm")
unpack_2x16(fmt="half")
348
# Integer vector -> scalar packing.
unop_horiz(name="pack_uvec2_to_uint", output_size=1, output_type=tuint32,
           input_size=2, input_type=tuint32, const_expr="""
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz(name="pack_uvec4_to_uint", output_size=1, output_type=tuint32,
           input_size=4, input_type=tuint32, const_expr="""
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# Narrow components concatenated into one wider integer, x in the low bits.
unop_horiz(name="pack_32_2x16", output_size=1, output_type=tuint32,
           input_size=2, input_type=tuint16,
           const_expr="dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz(name="pack_64_2x32", output_size=1, output_type=tuint64,
           input_size=2, input_type=tuint32,
           const_expr="dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz(name="pack_64_4x16", output_size=1, output_type=tuint64,
           input_size=4, input_type=tuint16,
           const_expr="dst.x = src0.x | ((uint64_t)src0.y << 16) | "
                      "((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# The reverse of pack_64_2x32.
unop_horiz(name="unpack_64_2x32", output_size=2, output_type=tuint32,
           input_size=1, input_type=tuint64,
           const_expr="dst.x = src0.x; dst.y = src0.x >> 32;")
371
# Split one 64-bit value into four 16-bit components, x from the low bits.
# The source has a single component, so every destination component must be
# taken from src0.x (the top one previously read src0.w).
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
374
# Split one 32-bit value into two 16-bit components, x from the low bits.
unop_horiz(name="unpack_32_2x16", output_size=2, output_type=tuint16,
           input_size=1, input_type=tuint32,
           const_expr="dst.x = src0.x; dst.y = src0.x >> 16;")
377
# Same layout as unpack_half_2x16 but decoded with the flush-to-zero helper.
# The high half must be shifted DOWN before the narrowing cast; shifting left
# by 16 and truncating to uint16_t always yields 0.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
382
# Lowered floating point unpacking operations.

unop_convert(name="unpack_half_2x16_split_x",
             out_type=tfloat32, in_type=tuint32,
             const_expr="unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert(name="unpack_half_2x16_split_y",
             out_type=tfloat32, in_type=tuint32,
             const_expr="unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert(name="unpack_half_2x16_split_x_flush_to_zero",
             out_type=tfloat32, in_type=tuint32,
             const_expr="unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert(name="unpack_half_2x16_split_y_flush_to_zero",
             out_type=tfloat32, in_type=tuint32,
             const_expr="unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer halves: _x takes the low part, _y the high part.
unop_convert(name="unpack_32_2x16_split_x",
             out_type=tuint16, in_type=tuint32, const_expr="src0")
unop_convert(name="unpack_32_2x16_split_y",
             out_type=tuint16, in_type=tuint32, const_expr="src0 >> 16")

unop_convert(name="unpack_64_2x32_split_x",
             out_type=tuint32, in_type=tuint64, const_expr="src0")
unop_convert(name="unpack_64_2x32_split_y",
             out_type=tuint32, in_type=tuint64, const_expr="src0 >> 32")
400
# Bit operations, part of ARB_gpu_shader5.

# Mirror the 32 bits of the source.
unop(name="bitfield_reverse", ty=tuint32, const_expr="""
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")

# Number of set bits.
unop_convert(name="bit_count", out_type=tuint32, in_type=tuint,
             const_expr="""
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")

# Index of the highest set bit, or -1 when no bit is set.
unop_convert(name="ufind_msb", out_type=tint32, in_type=tuint,
             const_expr="""
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")

# Count of leading zero bits.
unop(name="uclz", ty=tuint32, const_expr="""
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 & (1u << bit)) != 0)
break;
}
dst = (unsigned)(31 - bit);
""")

# Signed variant of ufind_msb; see the comment inside the expression.
unop(name="ifind_msb", ty=tint32, const_expr="""
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the lowest set bit, or -1 when no bit is set.
unop_convert(name="find_lsb", out_type=tint32, in_type=tint,
             const_expr="""
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
460
# AMD_gcn_shader extended instructions

# 2D coordinate within the cube face selected by the largest-magnitude
# component of the 3-component source.
unop_horiz(name="cube_face_coord", output_size=2, output_type=tfloat32,
           input_size=3, input_type=tfloat32, const_expr="""
dst.x = dst.y = 0.0;
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# Face number 0..5, chosen from the largest-magnitude component and its sign.
unop_horiz(name="cube_face_index", output_size=1, output_type=tfloat32,
           input_size=3, input_type=tfloat32, const_expr="""
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
495
# Sum of vector components: registers fsum2, fsum3 and fsum4.
unop_reduce(name="fsum", output_size=1, output_type=tfloat,
            input_type=tfloat, prereduce_expr="{src}",
            reduce_expr="{src0} + {src1}", final_expr="{src}")
498
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    The output type is chosen separately; all sizes are 0 and is_conversion
    is False.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0, 0], input_types=[in_type, in_type],
           is_conversion=False, algebraic_properties=alg_props,
           const_expr=const_expr)
502
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose output and sources all use ty."""
    binop_convert(name=name, out_type=ty, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
505
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool1."""
    binop_convert(name=name, out_type=tbool1, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
508
def binop_compare8(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool8."""
    binop_convert(name=name, out_type=tbool8, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
511
def binop_compare16(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool16."""
    binop_convert(name=name, out_type=tbool16, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
514
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool32."""
    binop_convert(name=name, out_type=tbool32, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
517
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register one comparison for each of the four boolean result sizes.

    The bool1 variant keeps the plain name; the bool8, bool16 and bool32
    variants get "8", "16" and "32" appended.
    """
    variants = [
        ("", tbool1),
        ("8", tbool8),
        ("16", tbool16),
        ("32", tbool32),
    ]
    for suffix, bool_type in variants:
        binop_convert(name + suffix, bool_type, ty, alg_props, const_expr)
523
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types throughout.

    is_conversion is False and no algebraic properties are set.
    """
    opcode(name=name, output_size=out_size, output_type=out_type,
           input_sizes=[src1_size, src2_size],
           input_types=[src1_type, src2_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
528
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a two-source reduction.

    prereduce_expr is formatted with {src0} and {src1} for each pair of
    matching source components, reduce_expr combines two partial results,
    and final_expr is formatted with {src} set to the parenthesised
    reduction.  Every variant is marked commutative in its two sources.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    # Pre-reduced, parenthesised expression for each component pair.
    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + component,
                                    src1="src1." + component) + ")"
        for component in "xyzw"
    ]
    xy = combine(x, y)

    opcode(name + "2", output_size, output_type,
           [2, 2], [src_type, src_type], False, _2src_commutative,
           finish(xy))
    opcode(name + "3", output_size, output_type,
           [3, 3], [src_type, src_type], False, _2src_commutative,
           finish(combine(xy, z)))
    opcode(name + "4", output_size, output_type,
           [4, 4], [src_type, src_type], False, _2src_commutative,
           finish(combine(xy, combine(z, w))))
550
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Register a reduction once per boolean result size.

    The bool1 variant keeps the plain name; for bool8, bool16 and bool32 the
    first character of the name is replaced by "b8", "b16" and "b32".
    """
    tail = name[1:]
    variants = [
        (name, tbool1),
        ("b8" + tail, tbool8),
        ("b16" + tail, tbool16),
        ("b32" + tail, tbool32),
    ]
    for variant_name, bool_type in variants:
        binop_reduce(variant_name, output_size, bool_type, src_type,
                     prereduce_expr, reduce_expr, final_expr)
561
# Floating-point add; takes a separate path when round-toward-zero is active.
binop(name="fadd", ty=tfloat, alg_props=_2src_commutative + associative,
      const_expr="""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_add_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
dst = src0 + src1;
}
""")
binop(name="iadd", ty=tint, alg_props=_2src_commutative + associative,
      const_expr="src0 + src1")

# Saturating integer add and subtract.
binop(name="iadd_sat", ty=tint, alg_props=_2src_commutative,
      const_expr="""
src1 > 0 ?
(src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
(src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
""")
binop(name="uadd_sat", ty=tuint, alg_props=_2src_commutative,
      const_expr="(src0 + src1) < src0 ? "
                 "MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
binop(name="isub_sat", ty=tint, alg_props="", const_expr="""
src1 < 0 ?
(src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
(src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
""")
binop(name="usub_sat", ty=tuint, alg_props="",
      const_expr="src0 < src1 ? 0 : src0 - src1")
586
# Floating-point subtract; takes a separate path when round-toward-zero is
# active.
binop(name="fsub", ty=tfloat, alg_props="", const_expr="""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_sub_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
dst = src0 - src1;
}
""")
binop(name="isub", ty=tint, alg_props="", const_expr="src0 - src1")

# Absolute difference, for signed sources (unsigned result) and for unsigned
# sources.
binop_convert(name="uabs_isub", out_type=tuint, in_type=tint, alg_props="",
              const_expr="""
src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
: (uint64_t) src0 - (uint64_t) src1
""")
binop(name="uabs_usub", ty=tuint, alg_props="",
      const_expr="(src1 > src0) ? (src1 - src0) : (src0 - src1)")
603
# Floating-point multiply; takes a separate path when round-toward-zero is
# active.
binop(name="fmul", ty=tfloat, alg_props=_2src_commutative + associative,
      const_expr="""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_mul_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop(name="imul", ty=tint, alg_props=_2src_commutative + associative,
      const_expr="src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert(name="imul_2x32_64", out_type=tint64, in_type=tint32,
              alg_props=_2src_commutative,
              const_expr="(int64_t)src0 * (int64_t)src1")
binop_convert(name="umul_2x32_64", out_type=tuint64, in_type=tuint32,
              alg_props=_2src_commutative,
              const_expr="(uint64_t)src0 * (uint64_t)src1")
622
# high 32-bits of signed integer multiply
binop(name="imul_high", ty=tint, alg_props=_2src_commutative,
      const_expr="""
if (bit_size == 64) {
/* We need to do a full 128-bit x 128-bit multiply in order for the sign
* extension to work properly. The casts are kind-of annoying but needed
* to prevent compiler warnings.
*/
uint32_t src0_u32[4] = {
src0,
(int64_t)src0 >> 32,
(int64_t)src0 >> 63,
(int64_t)src0 >> 63,
};
uint32_t src1_u32[4] = {
src1,
(int64_t)src1 >> 32,
(int64_t)src1 >> 63,
(int64_t)src1 >> 63,
};
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
binop(name="umul_high", ty=tuint, alg_props=_2src_commutative,
      const_expr="""
if (bit_size == 64) {
/* The casts are kind-of annoying but needed to prevent compiler warnings. */
uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")
663
# low 32-bits of unsigned integer multiply
binop(name="umul_low", ty=tuint32, alg_props=_2src_commutative,
      const_expr="""
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")

# Multiply 32-bits with low 16-bits.
binop(name="imul_32x16", ty=tint32, alg_props="",
      const_expr="src0 * (int16_t) src1")
binop(name="umul_32x16", ty=tuint32, alg_props="",
      const_expr="src0 * (uint16_t) src1")

# Division; the integer forms fold a zero divisor to 0.
binop(name="fdiv", ty=tfloat, alg_props="", const_expr="src0 / src1")
binop(name="idiv", ty=tint, alg_props="",
      const_expr="src1 == 0 ? 0 : (src0 / src1)")
binop(name="udiv", ty=tuint, alg_props="",
      const_expr="src1 == 0 ? 0 : (src0 / src1)")
677
# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert(name="uadd_carry", out_type=tuint, in_type=tuint,
              alg_props=_2src_commutative, const_expr="src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert(name="usub_borrow", out_type=tuint, in_type=tuint,
              alg_props="", const_expr="src0 < src1")
687
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +  (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y)            + (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop(name="ihadd", ty=tint, alg_props=_2src_commutative,
      const_expr="(src0 & src1) + ((src0 ^ src1) >> 1)")
binop(name="uhadd", ty=tuint, alg_props=_2src_commutative,
      const_expr="(src0 & src1) + ((src0 ^ src1) >> 1)")
700
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) -   ((x ^ y)      >> 1)
#
# The halved XOR term is SUBTRACTED, e.g. for x = 3, y = 0:
# (3 + 0 + 1) >> 1 = 2 = (3 | 0) - ((3 ^ 0) >> 1).
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
713
# Unsigned modulus; a zero divisor folds to 0.
binop(name="umod", ty=tuint, alg_props="",
      const_expr="src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop(name="irem", ty=tint, alg_props="",
      const_expr="src1 == 0 ? 0 : src0 % src1")
binop(name="imod", ty=tint, alg_props="",
      const_expr="src1 == 0 ? 0 : "
                 "((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
                 " src0 % src1 : src0 % src1 + src1)")
# Floating-point counterparts: fmod uses floor, frem uses truncation.
binop(name="fmod", ty=tfloat, alg_props="",
      const_expr="src0 - src1 * floorf(src0 / src1)")
binop(name="frem", ty=tfloat, alg_props="",
      const_expr="src0 - src1 * truncf(src0 / src1)")
730
#
# Comparisons
#

# these integer-aware comparisons return a boolean (0 or ~0)

# Floating-point.
binop_compare_all_sizes(name="flt", ty=tfloat, alg_props="",
                        const_expr="src0 < src1")
binop_compare_all_sizes(name="fge", ty=tfloat, alg_props="",
                        const_expr="src0 >= src1")
binop_compare_all_sizes(name="feq", ty=tfloat, alg_props=_2src_commutative,
                        const_expr="src0 == src1")
binop_compare_all_sizes(name="fne", ty=tfloat, alg_props=_2src_commutative,
                        const_expr="src0 != src1")
# Signed integer.
binop_compare_all_sizes(name="ilt", ty=tint, alg_props="",
                        const_expr="src0 < src1")
binop_compare_all_sizes(name="ige", ty=tint, alg_props="",
                        const_expr="src0 >= src1")
binop_compare_all_sizes(name="ieq", ty=tint, alg_props=_2src_commutative,
                        const_expr="src0 == src1")
binop_compare_all_sizes(name="ine", ty=tint, alg_props=_2src_commutative,
                        const_expr="src0 != src1")
# Unsigned integer.
binop_compare_all_sizes(name="ult", ty=tuint, alg_props="",
                        const_expr="src0 < src1")
binop_compare_all_sizes(name="uge", ty=tuint, alg_props="",
                        const_expr="src0 >= src1")
748
# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce_all_sizes(name="ball_fequal", output_size=1, src_type=tfloat,
                       prereduce_expr="{src0} == {src1}",
                       reduce_expr="{src0} && {src1}", final_expr="{src}")
binop_reduce_all_sizes(name="bany_fnequal", output_size=1, src_type=tfloat,
                       prereduce_expr="{src0} != {src1}",
                       reduce_expr="{src0} || {src1}", final_expr="{src}")
binop_reduce_all_sizes(name="ball_iequal", output_size=1, src_type=tint,
                       prereduce_expr="{src0} == {src1}",
                       reduce_expr="{src0} && {src1}", final_expr="{src}")
binop_reduce_all_sizes(name="bany_inequal", output_size=1, src_type=tint,
                       prereduce_expr="{src0} != {src1}",
                       reduce_expr="{src0} || {src1}", final_expr="{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce(name="fall_equal", output_size=1, output_type=tfloat32,
             src_type=tfloat32, prereduce_expr="{src0} == {src1}",
             reduce_expr="{src0} && {src1}",
             final_expr="{src} ? 1.0f : 0.0f")
binop_reduce(name="fany_nequal", output_size=1, output_type=tfloat32,
             src_type=tfloat32, prereduce_expr="{src0} != {src1}",
             reduce_expr="{src0} || {src1}",
             final_expr="{src} ? 1.0f : 0.0f")
766
# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop(name="slt", ty=tfloat32, alg_props="",
      const_expr="(src0 < src1) ? 1.0f : 0.0f")  # Set on Less Than
binop(name="sge", ty=tfloat, alg_props="",
      const_expr="(src0 >= src1) ? 1.0f : 0.0f")  # Set on Greater or Equal
binop(name="seq", ty=tfloat32, alg_props=_2src_commutative,
      const_expr="(src0 == src1) ? 1.0f : 0.0f")  # Set on Equal
binop(name="sne", ty=tfloat32, alg_props=_2src_commutative,
      const_expr="(src0 != src1) ? 1.0f : 0.0f")  # Set on Not Equal
774
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
opcode(name="ishl", output_size=0, output_type=tint,
       input_sizes=[0, 0], input_types=[tint, tuint32],
       is_conversion=False, algebraic_properties="",
       const_expr="src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode(name="ishr", output_size=0, output_type=tint,
       input_sizes=[0, 0], input_types=[tint, tuint32],
       is_conversion=False, algebraic_properties="",
       const_expr="src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode(name="ushr", output_size=0, output_type=tuint,
       input_sizes=[0, 0], input_types=[tuint, tuint32],
       is_conversion=False, algebraic_properties="",
       const_expr="src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Rotates; the rotate amount is masked to the width of the first source.
opcode(name="urol", output_size=0, output_type=tuint,
       input_sizes=[0, 0], input_types=[tuint, tuint32],
       is_conversion=False, algebraic_properties="", const_expr="""
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 << (src1 & rotate_mask)) |
(src0 >> (-src1 & rotate_mask));
""")
opcode(name="uror", output_size=0, output_type=tuint,
       input_sizes=[0, 0], input_types=[tuint, tuint32],
       is_conversion=False, algebraic_properties="", const_expr="""
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 >> (src1 & rotate_mask)) |
(src0 << (-src1 & rotate_mask));
""")
795
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.

binop(name="iand", ty=tuint, alg_props=_2src_commutative + associative,
      const_expr="src0 & src1")
binop(name="ior", ty=tuint, alg_props=_2src_commutative + associative,
      const_expr="src0 | src1")
binop(name="ixor", ty=tuint, alg_props=_2src_commutative + associative,
      const_expr="src0 ^ src1")

# Dot products: sum of per-component products, in 2-, 3- and 4-wide forms.
binop_reduce(name="fdot", output_size=1, output_type=tfloat,
             src_type=tfloat, prereduce_expr="{src0} * {src1}",
             reduce_expr="{src0} + {src1}", final_expr="{src}")

binop_reduce(name="fdot_replicated", output_size=4, output_type=tfloat,
             src_type=tfloat, prereduce_expr="{src0} * {src1}",
             reduce_expr="{src0} + {src1}", final_expr="{src}")

# 3-component dot product plus the w component of the second source.
opcode(name="fdph", output_size=1, output_type=tfloat,
       input_sizes=[3, 4], input_types=[tfloat, tfloat],
       is_conversion=False, algebraic_properties="",
       const_expr="src0.x * src1.x + src0.y * src1.y + "
                  "src0.z * src1.z + src1.w")
opcode(name="fdph_replicated", output_size=4, output_type=tfloat,
       input_sizes=[3, 4], input_types=[tfloat, tfloat],
       is_conversion=False, algebraic_properties="",
       const_expr="src0.x * src1.x + src0.y * src1.y + "
                  "src0.z * src1.z + src1.w")
817
# Minimum and maximum for float, signed and unsigned operands.
binop(name="fmin", ty=tfloat, alg_props=_2src_commutative + associative,
      const_expr="fmin(src0, src1)")
binop(name="imin", ty=tint, alg_props=_2src_commutative + associative,
      const_expr="src1 > src0 ? src0 : src1")
binop(name="umin", ty=tuint, alg_props=_2src_commutative + associative,
      const_expr="src1 > src0 ? src0 : src1")
binop(name="fmax", ty=tfloat, alg_props=_2src_commutative + associative,
      const_expr="fmax(src0, src1)")
binop(name="imax", ty=tint, alg_props=_2src_commutative + associative,
      const_expr="src1 > src0 ? src1 : src0")
binop(name="umax", ty=tuint, alg_props=_2src_commutative + associative,
      const_expr="src1 > src0 ? src1 : src0")
824
# Saturated add of four 8-bit lanes packed into one 32-bit value: each
# lane's sum is clamped to 0xff before being placed back in its byte.
_usadd_4x8_expr = """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
"""
binop("usadd_4x8", tint32, _2src_commutative + associative, _usadd_4x8_expr)
832
# Saturated subtract of four 8-bit lanes packed into one 32-bit value: a
# lane only contributes when src0's byte exceeds src1's, otherwise it stays
# zero.
_ussub_4x8_expr = """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
"""
binop("ussub_4x8", tint32, "", _ussub_4x8_expr)
843
# Per-lane minimum and maximum of four 8-bit lanes packed into one 32-bit
# value.  The two opcodes differ only in the macro applied to each lane.
for _lane_name, _lane_macro in (("umin_4x8", "MIN2"), ("umax_4x8", "MAX2")):
   binop(_lane_name, tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= %s((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""" % _lane_macro)
859
# Unorm multiply of four 8-bit lanes: each lane computes (a * b) / 255.
#
# NOTE(review): the integer divide truncates per lane, so regrouping three
# operands can change the result (e.g. lanes 254, 254, 2) -- confirm that
# the "associative" flag is intended.
_umul_unorm_4x8_expr = """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
"""
binop("umul_unorm_4x8", tint32, _2src_commutative + associative,
      _umul_unorm_4x8_expr)
869
# Floating-point power.  64-bit operands must go through the
# double-precision pow(); narrower bit sizes use the single-precision
# powf().  (The selection used to be inverted, which rounded 64-bit
# results to float precision.)
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
871
# Pack two floats as half-floats into one 32-bit word: src0's half goes in
# the low 16 bits and src1's half in the high 16 bits.
_pack_half_split_expr = "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)"
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            _pack_half_split_expr)

# Integer "split" packs: src0 supplies the low half of the result and src1,
# widened to the result type and shifted up, supplies the high half.
for _pack_name, _pack_out, _pack_in, _pack_ctype, _pack_shift in (
      ("pack_64_2x32_split", tuint64, tuint32, "uint64_t", 32),
      ("pack_32_2x16_split", tuint32, tuint16, "uint32_t", 16)):
   binop_convert(_pack_name, _pack_out, _pack_in, "",
                 "src0 | ((%s)src1 << %d)" % (_pack_ctype, _pack_shift))
880
# bfm implements the first operation of the SM5 "bfi" assembly and the i965
# "bfi1" instruction: it builds a mask of `bits` ones shifted left by
# `offset`, where bits and offset are the low five bits of src0 and src1
# respectively.
_bfm_expr = """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
"""
binop_convert("bfm", tuint32, tint32, "", _bfm_expr)
889
# ldexp: scale src0 by 2 raised to the integer src1, using the double or
# float C routine depending on bit size, then flush denormal results to a
# zero carrying src0's sign.
#
# NOTE(review): isnormal() is also false for infinities and NaN, so those
# results are replaced by a signed zero as well -- confirm this is intended.
_ldexp_expr = """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
"""
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", _ldexp_expr)
896
# Build a 2-component vector from the first component of each source.
_vec2_expr = """
dst.x = src0.x;
dst.y = src1.x;
"""
binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, _vec2_expr)
903
# Byte (8-bit) and word (16-bit) extraction: src1 selects which byte/word
# of src0 to take, and the C cast truncates to that width with zero
# extension (u) or sign extension (i).
for _ext_bits in (8, 16):
   for _ext_sign, _ext_type, _ext_ctype in (("u", tuint, "uint"),
                                            ("i", tint, "int")):
      binop("extract_%s%d" % (_ext_sign, _ext_bits), _ext_type, "",
            "(%s%d_t)(src0 >> (src1 * %d))" % (_ext_ctype, _ext_bits,
                                               _ext_bits))
911
912
def triop(name, ty, alg_props, const_expr):
   """Register a three-source opcode whose sources and result share one type.

   - name is the opcode name
   - ty is used for the output and for each of the three inputs
   - alg_props is forwarded as the algebraic properties
   - const_expr is forwarded as the constant-folding expression

   All sizes are passed as 0 and the opcode is not marked as a conversion.
   """
   src_sizes = [0] * 3
   src_types = [ty] * 3
   opcode(name, 0, ty, src_sizes, src_types, False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Register a three-source tuint opcode with explicit component sizes.

   - name is the opcode name
   - output_size is the size passed for the result
   - src1_size, src2_size and src3_size are the sizes passed for the inputs
   - const_expr is forwarded as the constant-folding expression

   The output and all inputs are typed tuint, no algebraic properties are
   set, and the opcode is not marked as a conversion.
   """
   src_sizes = [src1_size, src2_size, src3_size]
   src_types = [tuint] * len(src_sizes)
   opcode(name, output_size, tuint, src_sizes, src_types, False, "",
          const_expr)
919
# Fused multiply-add: src0 * src1 + src2.
#
# When the execution mode asks for round-toward-zero at this bit size the
# Mesa software RTZ helpers are used (the remaining bit sizes go through the
# double helper and are then narrowed with RTZ).  Otherwise the C library's
# fmaf() handles 32-bit and fma() handles everything else.
_ffma_expr = """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
"""
triop("ffma", tfloat, _2src_commutative, _ffma_expr)
935
# Linear interpolation between src0 and src1 with src2 as the weight
# (src2 == 0 gives src0, src2 == 1 gives src1).
_flrp_expr = "src0 * (1 - src2) + src1 * src2"
triop("flrp", tfloat, "", _flrp_expr)
937
# Conditional select: a per-component vector version of the C "?:" operator.
#
# It exists in two flavours, one taking floating point bools (0.0 vs 1.0)
# and one taking integer bools (0 vs ~0).  This is the floating point one:
# any src0 other than 0.0f selects src1.
_fcsel_expr = "(src0 != 0.0f) ? src1 : src2"
triop("fcsel", tfloat32, "", _fcsel_expr)
946
# 3 way min/max/med
#
# The float variants are typed tfloat, so they may be evaluated at any float
# bit size.  They use the double-precision fmin()/fmax(), like the
# two-source fmin/fmax opcodes, so 64-bit operands are not rounded to float;
# min/max only select an operand, so narrower sizes are unaffected.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
959
# Conditional selects with a boolean src0: one opcode per boolean type
# (tbool1, tbool8, tbool16, tbool32).  A true src0 selects src1, otherwise
# src2.
for _csel_name, _csel_bool in (("bcsel", tbool1),
                               ("b8csel", tbool8),
                               ("b16csel", tbool16),
                               ("b32csel", tbool32)):
   opcode(_csel_name, 0, tuint, [0, 0, 0],
          [_csel_bool, tuint, tuint], False, "", "src0 ? src1 : src2")
968
# SM5 bfi assembly: insert src1 into src2 under the mask src0.  The insert
# value is shifted left until it lines up with the lowest set bit of the
# mask; an all-zero mask returns the base unchanged.
_bfi_expr = """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
"""
triop("bfi", tuint32, "", _bfi_expr)
983
984
# Per-bit select: where a bit of src0 is set the result takes that bit from
# src1, elsewhere it takes it from src2.
_bitfield_select_expr = "(src0 & src1) | (~src0 & src2)"
triop("bitfield_select", tuint, "", _bitfield_select_expr)
986
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and
# bits are used.  The unsigned and signed versions share the same body; they
# differ only in the C type of `base` (which decides whether the right
# shifts zero- or sign-extend) and in the opcode's value type.
for _bfe_name, _bfe_type, _bfe_ctype in (("ubfe", tuint32, "unsigned"),
                                         ("ibfe", tint32, "int")):
   opcode(_bfe_name, 0, _bfe_type,
          [0, 0, 0], [_bfe_type, tuint32, tuint32], False, "", """
%s base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""" % _bfe_ctype)
1014
# GLSL bitfieldExtract(), unsigned version: return `bits` bits of src0
# starting at bit `offset`, zero-extended.  A zero width yields 0, and
# arguments the spec leaves undefined (negative values, or a field running
# past bit 31) also yield 0.
_ubitfield_extract_expr = """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
"""
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "",
       _ubitfield_extract_expr)
# GLSL bitfieldExtract(), signed version: return `bits` bits of src0
# starting at bit `offset`, sign-extended from the top bit of the field.
# A zero width yields 0, as do negative arguments or a field running past
# bit 31.
#
# The field is first shifted left so its top bit lands in bit 31, then
# shifted right by (32 - bits) so its lowest bit lands in bit 0 with sign
# extension.  Shifting right by `offset` instead (as this code once did) is
# only correct when offset == 32 - bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1040
# Build a 3-component vector from the first component of each source.
_vec3_expr = """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
"""
triop_horiz("vec3", 3, 1, 1, 1, _vec3_expr)
1048
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-source tuint opcode with explicit component sizes.

   - name is the opcode name
   - output_size is the size passed for the result
   - src1_size .. src4_size are the sizes passed for the four inputs
   - const_expr is forwarded as the constant-folding expression

   The output and all inputs are typed tuint, no algebraic properties are
   set, and the opcode is not marked as a conversion.
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   src_types = [tuint] * len(src_sizes)
   opcode(name, output_size, tuint, src_sizes, src_types,
          False, "", const_expr)
1055
# Bitfield insert: replace `bits` bits of src0, starting at bit `offset`,
# with the low bits of src1.  A zero width returns the base unchanged;
# negative arguments or a field running past bit 31 yield 0.
_bitfield_insert_expr = """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
"""
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "",
       _bitfield_insert_expr)
1069
# Build a 4-component vector from the first component of each source.
_vec4_expr = """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
"""
quadop_horiz("vec4", 4, 1, 1, 1, 1, _vec4_expr)
1076
# Build 8- and 16-component vectors from the first component of each
# source.  Destination components are named x, y, z, w, then e through p;
# one "dst.<c> = src<i>.x;" line is emitted per component.
for _wide_len in (8, 16):
   opcode("vec%d" % _wide_len, _wide_len, tuint,
          [1] * _wide_len, [tuint] * _wide_len,
          False, "",
          "\n" + "".join(
             "dst.%s = src%d.x;\n" % (_chan, _idx)
             for _idx, _chan in enumerate("xyzwefghijklmnop"[:_wide_len])))
1110
# Integer multiply for address calculation.  It is like imul except that
# the result is undefined on overflow, where overflow is judged against the
# size of the variable being dereferenced.
#
# Compared to imul, this relaxed definition lets an optimization pass
# propagate bounds (i.e. from a load/store intrinsic) to the sources so
# that lower precision integer multiplies can be used.  That helps on
# hardware with 24-bit or perhaps 16-bit integer multiply instructions.
_amul_props = _2src_commutative + associative
binop("amul", tint, _amul_props, "src0 * src1")
1122
# ir3-specific instruction that maps directly to mul-add shift high mix
# (IMADSH_MIX16, i.e. ah * bl << 16 + c): the high 16 bits of src0 times
# the low 16 bits of src1, shifted left by 16, plus src2.  It is used for
# lowering integer multiplication (imul) on the Freedreno backend.
_imadsh_mix16_expr = """
dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
"""
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", _imadsh_mix16_expr)
1130
# Signed 24-bit multiplies.  Each source is sign-extended from its low 24
# bits (shift left by 8, then arithmetic shift right by 8) before the
# multiply, which produces a 32-bit result.
_s24_product = "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)"

# ir3-specific instruction that maps directly to ir3 mad.s24: the 24-bit
# product plus a 32-bit addend.
triop("imad24_ir3", tint32, _2src_commutative, _s24_product + " + src2")

# 24-bit multiply into a 32-bit result (with sign extension).
binop("imul24", tint32, _2src_commutative + associative, _s24_product)
1140
# Unsigned 24-bit multiply-add: the low 24 bits of src0 and src1 (the top 8
# bits are cleared by the shift pair) are multiplied into a 32-bit result
# and the 32-bit src2 is added.
_umad24_expr = "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2"
triop("umad24", tuint32, _2src_commutative, _umad24_expr)
1144
# Unsigned 24-bit multiply into a 32-bit unsigned result: the low 24 bits
# of src0 and src1 (the top 8 bits are cleared by the shift pair) are
# multiplied.  The opcode is typed tuint32 to match its unsigned semantics
# and the uint32_t expression, consistent with umad24.
binop("umul24", tuint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")