nir: make fsat return 0.0 with NaN instead of passing it through
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.

        Raises AssertionError when an argument has the wrong type, the two
        input lists are empty or differ in length, or a size is out of range.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        assert isinstance(input_types, list)
        # At least one source is required; fail with a readable message
        # instead of an IndexError from peeking at element 0.
        assert input_sizes and input_types, \
            'opcode "{}" needs at least one input'.format(name)
        # Check every element, not just the first, so that a bad entry later
        # in the list cannot slip through.
        assert all(isinstance(size, int) for size in input_sizes)
        assert all(isinstance(type_, str) for type_ in input_types)
        assert isinstance(is_conversion, bool)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16)
        for size in input_sizes:
            assert 0 <= size <= 4
            # An opcode with a fixed output size must not have a size-0 input.
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
92
# helper variables for strings
#
# These are the type-name strings handed to Opcode as output_type and
# input_types.  Names without a trailing number carry no bit size; names
# such as "float16" or "uint64" pin one.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Splits a type string into its base type ("type" group) and an optional
# decimal bit size ("bits" group).  It is used with match(), which only
# anchors at the start of the string, so trailing characters are not rejected.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
112
def type_has_size(type_):
    """Return True when the NIR type string ends in an explicit bit size.

    Asserts that type_ starts with a known base type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
117
def type_size(type_):
    """Return the bit size encoded in a sized NIR type string as an int.

    Asserts that type_ is a valid type string and that it carries a size.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
124
def type_sizes(type_):
    """Return the list of bit sizes a NIR type string can take.

    A sized type yields a one-element list; an unsized type yields every
    size listed for its base type.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    if type_ == 'bool':
        return [1, 8, 16, 32]
    if type_ == 'float':
        return [16, 32, 64]
    # Everything else (the unsized integer types).
    return [1, 8, 16, 32, 64]
134
def type_base_type(type_):
    """Return the base type ("int", "uint", "float" or "bool") of type_.

    Asserts that type_ starts with one of those base types.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
139
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
#
# Both property strings end in a space so that they can be concatenated
# directly into the space-separated algebraic_properties string.
_2src_commutative = "2src_commutative "
associative = "associative "

# global dictionary of opcodes, keyed by opcode name and filled in by opcode()
opcodes = {}
150
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Create an Opcode from the arguments and register it under name.

    Asserts that no opcode with the same name has been registered before.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes,
                  input_types, is_conversion, algebraic_properties,
                  const_expr)
    opcodes[name] = info
157
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode (sizes 0) whose output type may differ
    from its input type.  It is not flagged as a conversion.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
160
def unop(name, ty, const_expr):
    """Register a one-source opcode (sizes 0) whose source and destination
    share the type ty.
    """
    opcode(name, output_size=0, output_type=ty, input_sizes=[0],
           input_types=[ty], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
163
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes."""
    sizes = [input_size]
    types = [input_type]
    opcode(name, output_size, output_type, sizes, types,
           False, "", const_expr)
168
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register name2, name3 and name4: reductions over the 2, 3 or 4
    components of a single source.

    prereduce_expr is formatted once per component ({src}), reduce_expr
    combines two partial results ({src0}, {src1}) and final_expr wraps the
    fully reduced expression ({src}).
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    # Parenthesised per-component expressions for src0.x .. src0.w.
    x, y, z, w = ["(" + prereduce_expr.format(src="src0." + comp) + ")"
                  for comp in "xyzw"]

    variants = [
        (2, combine(x, y)),
        (3, combine(combine(x, y), z)),
        # The 4-wide form reduces pairwise: (x, y) and (z, w).
        (4, combine(combine(x, y), combine(z, w))),
    ]
    for width, reduced in variants:
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, finish(reduced))
187
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode (sizes 0) that is flagged as a type
    conversion (is_conversion=True).
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=True,
           algebraic_properties="", const_expr=const_expr)
190
# Basic unary operations.  Each const_expr is the C expression used to
# compute the opcode's constant value from src0.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# The 64-bit arm uses double literals, the other arm float literals.
unop("fsign", tfloat, ("bit_size == 64 ? " +
     "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
     "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# C fmax() returns its non-NaN operand, so a NaN source clamps to 0.0
# instead of being passed through.
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): these two always call the single-precision exp2f/log2f,
# unlike frcp/frsq/fsqrt above which select on bit_size -- confirm intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
208
# Generate all of the numeric conversion opcodes
#
# Opcode names have the form <src initial>2<dst initial><dst bit size>, e.g.
# "f2f16"; float -> float16 conversions additionally get "_rtne" and "_rtz"
# variants with an explicit rounding mode.
for src_t in [tint, tuint, tfloat, tbool]:
    # Destination base types reachable from each source base type.
    dst_types = {
        tbool: [tfloat, tint, tbool],
        tint: [tfloat, tint, tbool],
        tuint: [tfloat, tuint],
        tfloat: [tint, tuint, tfloat, tbool],
    }[src_t]

    for dst_t in dst_types:
        float_to_float = dst_t == tfloat and src_t == tfloat

        for dst_bit_size in type_sizes(dst_t):
            conv_name = "{0}2{1}{2}".format(src_t[0], dst_t[0], dst_bit_size)
            conv_type = dst_t + str(dst_bit_size)

            if float_to_float and dst_bit_size == 16:
                rnd_modes = ['_rtne', '_rtz', '']
                for rnd_mode in rnd_modes:
                    if rnd_mode == '':
                        # No explicit rounding mode: plain assignment.
                        conv_expr = "src0"
                    elif rnd_mode == '_rtne':
                        conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
} else {
dst = src0;
}
"""
                    else:
                        conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
} else {
dst = src0;
}
"""
                    unop_numeric_convert(conv_name + rnd_mode, conv_type,
                                         src_t, conv_expr)
            elif float_to_float and dst_bit_size == 32:
                # Narrowing from a wider float honours the round-to-zero
                # execution mode when it is set for 32-bit floats.
                conv_expr = """
if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
dst = _mesa_double_to_float_rtz(src0);
} else {
dst = src0;
}
"""
                unop_numeric_convert(conv_name, conv_type, src_t, conv_expr)
            else:
                # Conversions to bool test against zero; everything else is a
                # plain assignment.
                if dst_t == tbool:
                    conv_expr = "src0 != 0"
                else:
                    conv_expr = "src0"
                unop_numeric_convert(conv_name, conv_type, src_t, conv_expr)
266
# Special opcode that is the same as f2f16 except that it is safe to remove it
# if the result is immediately converted back to float32 again. This is
# generated as part of the precision lowering pass. mp stands for medium
# precision.
#
# It reuses the constant expression of the already registered f2f16 opcode, so
# it must stay after the conversion opcodes have been generated.
unop_numeric_convert("f2fmp", tfloat16, tfloat, opcodes["f2f16"].const_expr)

# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Magnitudes below 2^-14 become a zero carrying the sign of src0; everything
# else is round-tripped through half precision.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
#
# frexp_exp lets frexp() write the exponent straight into dst and ignores the
# returned significand; frexp_sig keeps the significand and drops the exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
303
304
305 # Floating point pack and unpack operations.
306
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components packed into one
    uint32, src0.x in the low 16 bits and src0.y in the high 16 bits.

    Every occurrence of "fmt" in the C template is replaced by fmt.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    op_name = "pack_" + fmt + "_2x16"
    unop_horiz(op_name, 1, tuint32, 2, tfloat32, template.replace("fmt", fmt))
312
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components packed into one
    uint32, 8 bits each, with src0.x in the lowest byte.

    Every occurrence of "fmt" in the C template is replaced by fmt.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    op_name = "pack_" + fmt + "_4x8"
    unop_horiz(op_name, 1, tuint32, 4, tfloat32, template.replace("fmt", fmt))
320
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 unpacked into two float32
    components, dst.x from the low 16 bits and dst.y from the high 16 bits.

    Every occurrence of "fmt" in the C template is replaced by fmt.
    """
    # The high half must be shifted *down*.  (uint16_t)(src0.x << 16) keeps
    # only the low 16 bits of the shifted value, which are always zero.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
326
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 unpacked into four float32
    components, one per byte, with dst.x taken from the lowest byte.

    Every occurrence of "fmt" in the C template is replaced by fmt.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    op_name = "unpack_" + fmt + "_4x8"
    unop_horiz(op_name, 4, tfloat32, 1, tuint32, template.replace("fmt", fmt))
334
335
# Instantiate the float pack/unpack opcodes for each format.  Only the 2x16
# helpers are used for "half".
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Integer packing: src0.x lands in the low bits of the result.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# Sources are widened with a cast before shifting so that the shift happens
# at the destination width.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# The assignment to the narrower dst.x keeps only the low 32 bits.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
369
# Split a 64-bit value into four 16-bit components, lowest bits first.  The
# source has a single component, so every destination component -- including
# dst.w -- must be taken from src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

# Split a 32-bit value into its low (dst.x) and high (dst.y) 16 bits.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")
375
# Same layout as unpack_half_2x16, but using the flush-to-zero unpack helper.
# The high half is shifted *down*: (uint16_t)(src0.x << 16) would always be
# zero because the low 16 bits of the shifted value are empty.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
380
# Lowered floating point unpacking operations.
#
# The _x variants read the low half of the source, the _y variants the high
# half.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: the _x variants pass src0 through unchanged and the _y
# variants shift the high half down; the destination type is the narrower
# integer.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
398
# Bit operations, part of ARB_gpu_shader5.


# Mirror the 32 bits of src0: bit i moves to bit 31 - i.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Number of set bits in src0.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")

# Index of the highest set bit, or -1 when src0 is zero.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")

# Count of leading zero bits: the loop stops at the highest set bit, or
# leaves bit at -1 for a zero source, which yields 32.
unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 & (1u << bit)) != 0)
break;
}
dst = (unsigned)(31 - bit);
""")

# Signed variant of find-MSB; returns -1 when no qualifying bit is found.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the lowest set bit, or -1 when src0 is zero.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
458
# AMD_gcn_shader extended instructions
#
# cube_face_coord: the axis with the largest magnitude is the major axis;
# ma is twice its signed value, and the two face coordinates are taken from
# the other axes, divided by ma and biased by 0.5.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# cube_face_index: 0/1 for +X/-X, 2/3 for +Y/-Y, 4/5 for +Z/-Z.  The tests are
# evaluated in order without an else, so on ties the last matching one wins.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
496
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode (sizes 0) whose sources share in_type
    and whose result has type out_type.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0, 0],
           input_types=[in_type, in_type], is_conversion=False,
           algebraic_properties=alg_props, const_expr=const_expr)
500
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have
    type ty.
    """
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
503
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool1."""
    binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
506
def binop_compare8(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool8."""
    binop_convert(name, out_type=tbool8, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
509
def binop_compare16(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool16."""
    binop_convert(name, out_type=tbool16, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
512
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool32."""
    binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
515
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register the comparison once per boolean width: name (bool1),
    name8, name16 and name32.
    """
    variants = (
        (binop_compare, name),
        (binop_compare8, name + "8"),
        (binop_compare16, name + "16"),
        (binop_compare32, name + "32"),
    )
    for register, op_name in variants:
        register(op_name, ty, alg_props, const_expr)
521
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit output and per-source
    sizes and types, and no algebraic properties.
    """
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, False, "", const_expr)
526
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register name2, name3 and name4: component-wise combinations of two
    2-, 3- or 4-component sources, reduced to a single expression.

    prereduce_expr pairs up matching components ({src0}, {src1}),
    reduce_expr combines two partial results ({src0}, {src1}) and
    final_expr wraps the fully reduced expression ({src}).  All three
    opcodes are marked 2src_commutative.
    """
    def per_component(comp):
        paired = prereduce_expr.format(src0="src0." + comp,
                                       src1="src1." + comp)
        return "(" + paired + ")"

    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [per_component(comp) for comp in "xyzw"]

    variants = [
        (2, combine(x, y)),
        (3, combine(combine(x, y), z)),
        # The 4-wide form reduces pairwise: (x, y) and (z, w).
        (4, combine(combine(x, y), combine(z, w))),
    ]
    for width, reduced in variants:
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], False, _2src_commutative,
               finish(reduced))
548
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Register the reduction once per boolean width.

    The bool1 variant keeps name; the others replace the first character of
    name with "b8", "b16" and "b32" respectively.
    """
    tail = name[1:]
    variants = (
        (name, tbool1),
        ("b8" + tail, tbool8),
        ("b16" + tail, tbool16),
        ("b32" + tail, tbool32),
    )
    for op_name, bool_type in variants:
        binop_reduce(op_name, output_size, bool_type, src_type,
                     prereduce_expr, reduce_expr, final_expr)
559
# Floating-point add.  Under the round-to-zero execution mode the sum is
# computed through the explicit _rtz helpers (non-64-bit sizes go through
# double first); otherwise it is a plain C addition.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_add_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Signed saturating add: overflow is detected by the wrapped sum moving the
# wrong way relative to src0, and the result is clamped to the largest or
# smallest value representable in bit_size bits.
binop("iadd_sat", tint, _2src_commutative, """
src1 > 0 ?
(src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
(src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
""")
# Unsigned saturating add: a wrapped sum is smaller than src0.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Signed saturating subtract, mirroring iadd_sat.
binop("isub_sat", tint, "", """
src1 < 0 ?
(src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
(src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
""")
# Unsigned saturating subtract clamps at zero.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Floating-point subtract, with the same round-to-zero handling as fadd.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_sub_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
# Absolute difference of two signed values, produced as an unsigned result;
# the operands are cast to uint64_t before subtracting.
binop_convert("uabs_isub", tuint, tint, "", """
src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
: (uint64_t) src0 - (uint64_t) src1
""")
# Absolute difference of two unsigned values.
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
601
# Floating-point multiply, with the same round-to-zero handling as fadd.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_mul_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
#
# For bit sizes below 64 the product fits in 64 bits and is simply shifted
# down by bit_size; the 64-bit case multiplies arrays of 32-bit words.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
/* We need to do a full 128-bit x 128-bit multiply in order for the sign
* extension to work properly. The casts are kind-of annoying but needed
* to prevent compiler warnings.
*/
uint32_t src0_u32[4] = {
src0,
(int64_t)src0 >> 32,
(int64_t)src0 >> 63,
(int64_t)src0 >> 63,
};
uint32_t src1_u32[4] = {
src1,
(int64_t)src1 >> 32,
(int64_t)src1 >> 63,
(int64_t)src1 >> 63,
};
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
/* The casts are kind-of annoying but needed to prevent compiler warnings. */
uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# low 32-bits of unsigned integer multiply
#
# Both sources are masked to their low bit_size / 2 bits before multiplying.
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")

# Multiply 32-bits with low 16-bits.
binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")
671
# Integer division by zero yields 0 instead of invoking undefined C behavior.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       = (x & y) + (x & ~y) + (x & y) + (~x & y)
#       = 2 * (x & y) + (x & ~y) + (~x & y)
#       = ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              = (x & y) + ((x ^ y) >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
698
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
#           = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
#           = ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
#
# Note the subtraction: e.g. for x = 3, y = 0 the result must be
# (3 + 0 + 1) >> 1 = 2 = (3 | 0) - ((3 ^ 0) >> 1), whereas adding the second
# term would give 4.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
711
# As with integer division, a zero divisor yields 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 back onto the C remainder when the remainder is non-zero and
# the operands have different signs, so the result takes the sign of src1.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# NOTE(review): both expressions call the single-precision floorf/truncf
# regardless of bit_size -- confirm this is intended for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
728
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
#
# Each call registers four opcodes, one per boolean width (see
# binop_compare_all_sizes).

binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# "all equal" reduces the per-component results with &&, "any not equal"
# with ||.

binop_reduce_all_sizes("ball_fequal", 1, tfloat, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal", 1, tint, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with the unsized tfloat while slt, seq and
# sne use tfloat32 -- confirm the difference is deliberate.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
772
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count (src1) is always a uint32 and is masked to the bit width of
# src0 before use.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Rotates: the bits shifted out on one side are OR-ed back in on the other,
# using the negated, masked count for the complementary shift.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 << (src1 & rotate_mask)) |
(src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 >> (src1 & rotate_mask)) |
(src0 << (-src1 & rotate_mask));
""")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
803
804
# Dot products: multiply matching components, then sum.  The _replicated
# variant is registered with an output size of 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: dot product of a 3-component src0 with the xyz of src1, plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum / maximum.  The float versions defer to C fmin()/fmax(); the integer
# versions are spelled out as comparisons.
binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
822
# The *_4x8 opcodes treat each 32-bit source as four packed 8-bit lanes and
# process the lanes independently, stepping the shift amount by 8.

# Saturated add of four 8-bit lanes (each lane sum is clamped to 0xff).
binop("usadd_4x8", tint32, _2src_commutative + associative,
      """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated subtract of four 8-bit lanes (a lane stays 0 unless
# src0's lane is greater than src1's).
binop("ussub_4x8", tint32, "",
      """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# Per-lane minimum of four 8-bit lanes.
binop("umin_4x8", tint32, _2src_commutative + associative,
      """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# Per-lane maximum of four 8-bit lanes.
binop("umax_4x8", tint32, _2src_commutative + associative,
      """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# Per-lane unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative,
      """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
867
# Floating-point power.
#
# 64-bit operands must go through the double-precision pow(); narrower
# operands use the single-precision powf().  (The two calls were previously
# swapped, which rounded 64-bit operands and results through float.)
binop("fpow", tfloat, "",
      "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
869
# "Split" pack opcodes: src0 supplies the low half of the result and src1
# the high half.
binop_horiz("pack_half_2x16_split",
            1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
878
# bfm matches the first operation of the SM5 "bfi" assembly and the i965
# "bfi1" instruction: the bits and offset values come from the low five bits
# of src0 and src1, respectively.
binop_convert("bfm", tuint32, tint32, "",
              """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""")
887
# ldexp: scale src0 by two to the power src1, flushing denormal results to a
# zero that carries the sign of src0.
# NOTE(review): !isnormal() is also true for infinities and NaNs, so those
# results are replaced by a signed zero as well -- confirm this is intended.
opcode("ldexp", 0, tfloat,
       [0, 0], [tfloat, tint32],
       False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
894
# vec2: build a 2-component vector from the first component of each source.
binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint,
            """
dst.x = src0.x;
dst.y = src1.x;
""")
901
# Byte and word extraction: src1 selects which byte (8-bit) or word (16-bit)
# of src0 is returned; the C cast picks zero- or sign-extension.
for _extract_name, _extract_type, _extract_expr in (
        # Byte extraction
        ("extract_u8", tuint, "(uint8_t)(src0 >> (src1 * 8))"),
        ("extract_i8", tint, "(int8_t)(src0 >> (src1 * 8))"),
        # Word extraction
        ("extract_u16", tuint, "(uint16_t)(src0 >> (src1 * 16))"),
        ("extract_i16", tint, "(int16_t)(src0 >> (src1 * 16))")):
    binop(_extract_name, _extract_type, "", _extract_expr)
909
910
def triop(name, ty, alg_props, const_expr):
    """Declare a three-source opcode whose sources and result all use `ty`.

    Output size and all three source sizes are passed to opcode() as 0, and
    the opcode is declared as a non-conversion.
    """
    source_sizes = [0] * 3
    source_types = [ty] * 3
    opcode(name, 0, ty, source_sizes, source_types, False, alg_props,
           const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Declare a three-source tuint opcode with explicit sizes.

    The result and all sources are tuint; the opcode is declared as a
    non-conversion with no algebraic properties.
    """
    source_sizes = [src1_size, src2_size, src3_size]
    source_types = [tuint] * 3
    opcode(name, output_size, tuint, source_sizes, source_types,
           False, "", const_expr)
917
# Fused multiply-add.  When the execution mode requests round-towards-zero
# for this bit size, the software RTZ helpers are used; otherwise the libm
# fmaf()/fma() routines.
triop("ffma", tfloat, _2src_commutative,
      """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

# Linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "",
      "src0 * (1 - src2) + src1 * src2")
935
# Conditional Select
#
# A per-component vector select (like ?: applied to each component).  It
# comes in two flavours: one taking floating-point bools (0.0 vs 1.0) and
# one taking integer bools (0 vs ~0).  fcsel is the floating-point flavour.
triop("fcsel", tfloat32, "",
      "(src0 != 0.0f) ? src1 : src2")
944
# 3 way min/max/med
#
# The float variants use the double-precision fmin()/fmax(), matching the
# two-source fmin/fmax opcodes.  The single-precision fminf()/fmaxf() would
# round 64-bit operands through float; narrower operands convert to double
# exactly, so their results are unaffected.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
957
# Integer-bool conditional selects, one per boolean type used for the
# condition (src0); src1 is chosen when the condition is true, else src2.
for _csel_name, _csel_cond_type in (("bcsel", tbool1),
                                    ("b8csel", tbool8),
                                    ("b16csel", tbool16),
                                    ("b32csel", tbool32)):
    opcode(_csel_name, 0, tuint, [0, 0, 0],
           [_csel_cond_type, tuint, tuint], False, "",
           "src0 ? src1 : src2")
966
# SM5 bfi assembly: insert src1 into src2 under the mask src0.  The insert
# value is shifted left until it lines up with the lowest set bit of the
# mask; a zero mask returns the base unchanged.
triop("bfi", tuint32, "",
      """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")


# Bitwise select: takes bits of src1 where src0 is set, bits of src2 elsewhere.
triop("bitfield_select", tuint, "",
      "(src0 & src1) | (~src0 & src2)")
984
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and
# bits are used.

# Unsigned bitfield extract.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32],
       False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# Signed bitfield extract; base is a signed int in the expression below.
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32],
       False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
1012
# GLSL bitfieldExtract(), unsigned flavour.  A zero width gives 0; negative
# or out-of-range offset/bits also give 0 (undefined per the spec).
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32],
       False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# GLSL bitfieldExtract(), signed flavour.  A zero width gives 0, as do
# negative or out-of-range offset/bits.
#
# The field is first shifted left so that its top bit sits in bit 31, then
# shifted right by (32 - bits) so that it lands at bit 0 with sign extension.
# (The right shift used to be by `offset`, which only gave the correct result
# when offset == 32 - bits.)
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1038
# vec3: build a 3-component vector from the first component of each source.
triop_horiz("vec3", 3, 1, 1, 1,
            """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1046
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Declare a four-source tuint opcode with explicit sizes.

    The result and all sources are tuint; the opcode is declared as a
    non-conversion with no algebraic properties.
    """
    source_sizes = [src1_size, src2_size, src3_size, src4_size]
    source_types = [tuint] * 4
    opcode(name, output_size, tuint, source_sizes, source_types,
           False, "", const_expr)
1053
# Bitfield insert: replace `bits` bits of base (src0), starting at `offset`,
# with the low bits of insert (src1).  A zero width returns base unchanged;
# negative or out-of-range offset/bits give 0.
opcode("bitfield_insert", 0, tuint32,
       [0, 0, 0, 0], [tuint32, tuint32, tint32, tint32],
       False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1067
# vec4: build a 4-component vector from the first component of each source.
quadop_horiz("vec4", 4, 1, 1, 1, 1,
             """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
1074
# vec8: build an 8-component vector from the first component of each of the
# eight size-1 sources.
opcode("vec8", 8, tuint, [1] * 8, [tuint] * 8, False, "",
       """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

# vec16: build a 16-component vector from the first component of each of the
# sixteen size-1 sources.
opcode("vec16", 16, tuint, [1] * 16, [tuint] * 16, False, "",
       """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")
1108
# amul: an integer multiply intended for address calculation.  It behaves
# like imul except that the result is undefined on overflow, where overflow
# is judged against the size of the variable being dereferenced.
#
# Compared to imul, this relaxed definition lets an optimization pass
# propagate bounds (i.e. from a load/store intrinsic) to the sources, so
# that lower-precision integer multiplies can be used.  That is useful on
# hardware with 24b or perhaps 16b integer multiply instructions.
binop("amul", tint, _2src_commutative + associative,
      "src0 * src1")
1120
# ir3-specific instruction that maps directly to mul-add shift high mix
# (IMADSH_MIX16, i.e. ah * bl << 16 + c).  It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32],
       False, "", """
dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
""")
1128
# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# Signed 24b multiply into a 32b result (the << 8 >> 8 pair sign-extends each
# operand from 24 bits) plus a 32b int.
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# Signed 24b multiply into a 32b result (with sign extension).
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")
1138
# Unsigned 24b multiply into a 32b result plus a 32b int.  The << 8 >> 8
# pair on uint32_t clears the top 8 bits of each multiplicand.
triop("umad24", tuint32, _2src_commutative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")
1142
# Unsigned 24b multiply into a 32b unsigned result.  The << 8 >> 8 pair on
# uint32_t clears the top 8 bits of each multiplicand.
#
# Declared as tuint32 (it was tint32) so that the opcode's type agrees with
# its unsigned constant expression and with the sibling umad24 opcode.
binop("umul24", tuint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")