nir: fix fmin/fmax support for doubles
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        # Check every entry, not just the first one, so that a malformed
        # later source is rejected here instead of reaching the generators.
        assert all(isinstance(size, int) for size in input_sizes)
        assert isinstance(input_types, list)
        assert all(isinstance(type_, str) for type_ in input_types)
        assert isinstance(is_conversion, bool)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            # An opcode with a fixed output size may not have a source whose
            # size is 0.
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
92
# helper variables for strings
#
# Base types without a bit size.
tfloat = "float"
tint = "int"
tbool = "bool"
# Booleans with an explicit bit size.
tbool1 = "bool1"
tbool32 = "bool32"
tuint = "uint"
# Integer and float types with an explicit bit size.
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
107
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def _split_type(type_):
    """Match a NIR type string against _TYPE_SPLIT_RE.

    Asserts that the string starts with a known base type and returns the
    match object.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return parsed


def type_has_size(type_):
    """Return True if the type string carries an explicit bit size."""
    return _split_type(type_).group('bits') is not None


def type_size(type_):
    """Return the bit size of a sized type string as an int.

    Asserts that the type string actually has a bit size.
    """
    bits = _split_type(type_).group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)


def type_sizes(type_):
    """Return the list of bit sizes the given type string can take.

    A sized type yields just its own size; unsized types yield a fixed list
    that depends on the base type.
    """
    if type_has_size(type_):
        return [type_size(type_)]

    # A fresh dictionary (and therefore fresh lists) is built on every call,
    # so callers may mutate the returned list.
    unsized = {
        'bool': [1, 32],
        'float': [16, 32, 64],
    }
    return unsized.get(type_, [1, 8, 16, 32, 64])


def type_base_type(type_):
    """Return the base type name of a type string, without any bit size."""
    return _split_type(type_).group('type')
136
137 # Operation where the first two sources are commutative.
138 #
139 # For 2-source operations, this just mathematical commutativity. Some
140 # 3-source operations, like ffma, are only commutative in the first two
141 # sources.
142 _2src_commutative = "2src_commutative "
143 associative = "associative "
144
145 # global dictionary of opcodes
146 opcodes = {}
147
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Create an Opcode and store it in the global opcodes dictionary.

    All parameters are forwarded unchanged to the Opcode constructor.
    Asserts that no opcode of the same name has been registered yet.
    """
    assert name not in opcodes
    new_op = Opcode(name, output_size, output_type, input_sizes, input_types,
                    is_conversion, algebraic_properties, const_expr)
    opcodes[name] = new_op
154
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output type differs from its input.

    Output and input sizes are both passed as 0, is_conversion is False and
    no algebraic properties are set.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
157
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and destination share ty.

    Output and input sizes are both passed as 0 and no algebraic properties
    are set.
    """
    opcode(name, output_size=0, output_type=ty, input_sizes=[0],
           input_types=[ty], is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
160
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes.

    No algebraic properties are set and is_conversion is False.
    """
    opcode(name, output_size=output_size, output_type=output_type,
           input_sizes=[input_size], input_types=[input_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
165
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a reduction opcode.

    - prereduce_expr is formatted once per component with {src}
    - reduce_expr combines two partial results via {src0} and {src1}
    - final_expr is applied to the fully reduced value via {src}

    The variants are named name + "2", name + "3" and name + "4".
    """
    # One parenthesised pre-reduced term for each of src0.x .. src0.w.
    x, y, z, w = ["(" + prereduce_expr.format(src="src0." + comp) + ")"
                  for comp in "xyzw"]

    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    # The 4-component form is reduced pairwise: (x op y) op (z op w).
    reductions = (
        (2, combine(x, y)),
        (3, combine(combine(x, y), z)),
        (4, combine(combine(x, y), combine(z, w))),
    )
    for width, expr in reductions:
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, finish(expr))
184
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion.

    Identical to unop_convert except that is_conversion is True.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=True,
           algebraic_properties="", const_expr=const_expr)
187
unop("mov", tuint, "src0")

# Negation, sign and absolute value.  The float variants that mention
# literals select double or float literals based on bit_size.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
unop("fsign", tfloat,
     "bit_size == 64 ? "
     "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
     "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))")
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# Clamp to [0, 1].  The "<= 0.0" comparison is false for NaN, so a NaN
# input falls through to the final "src0" arm.
unop("fsat", tfloat,
     "bit_size == 64 ? "
     "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : "
     "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))")
# Reciprocal, square roots, exponential and logarithm.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat,
     "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
207
# Generate all of the numeric conversion opcodes
#
# Opcode names have the form <src letter>2<dst letter><dst bit size>, e.g.
# i2f32.  Conversions from float to 16-bit float additionally come in
# "_rtne" and "_rtz" flavours with an explicit rounding mode.
for src_t in [tint, tuint, tfloat, tbool]:
    # Destination base types that each source base type converts to.
    dst_types = {
        tbool: [tfloat, tint],
        tint: [tfloat, tint, tbool],
        tuint: [tfloat, tuint],
        tfloat: [tint, tuint, tfloat, tbool],
    }[src_t]

    for dst_t in dst_types:
        for bit_size in type_sizes(dst_t):
            sized_dst_t = dst_t + str(bit_size)
            conv_name = "{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size)
            float_to_float = dst_t == tfloat and src_t == tfloat

            if float_to_float and bit_size == 16:
                for rnd_mode in ['_rtne', '_rtz', '']:
                    if rnd_mode:
                        # Round through the explicit float16 conversion
                        # helper only when the source is wider than 16 bits.
                        conv_expr = """
if (bit_size > 16) {
   dst = _mesa_half_to_float(_mesa_float_to_float16""" + rnd_mode + """(src0));
} else {
   dst = src0;
}
"""
                    else:
                        conv_expr = "src0"

                    unop_numeric_convert(conv_name + rnd_mode, sized_dst_t,
                                         src_t, conv_expr)
                continue

            if float_to_float and bit_size == 32:
                # double -> float honours a round-towards-zero execution
                # mode for 32-bit floats.
                conv_expr = """
if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
   dst = _mesa_double_to_float_rtz(src0);
} else {
   dst = src0;
}
"""
            elif dst_t == tbool:
                conv_expr = "src0 != 0"
            else:
                conv_expr = "src0"

            unop_numeric_convert(conv_name, sized_dst_t, src_t, conv_expr)


260
261
# Unary floating-point rounding operations.

# Each of these calls the double libm function for 64-bit values and the
# "f"-suffixed float function otherwise.
for libm_fn in ("trunc", "ceil", "floor"):
    unop("f" + libm_fn, tfloat,
         "bit_size == 64 ? {0}(src0) : {0}f(src0)".format(libm_fn))
unop("ffract", tfloat,
     "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat,
     "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values whose magnitude is below 2^-14 become a zero carrying the sign of
# the source; everything else is round-tripped through a half float.
unop("fquantize2f16", tfloat,
     "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : "
     "_mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.

for libm_fn in ("sin", "cos"):
    unop("f" + libm_fn, tfloat,
         "bit_size == 64 ? {0}(src0) : {0}f(src0)".format(libm_fn))

# dfrexp
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.

# the derivative of a constant is 0.
for deriv_op in ("fddx", "fddy",
                 "fddx_fine", "fddy_fine",
                 "fddx_coarse", "fddy_coarse"):
    unop(deriv_op, tfloat, "0.0")


292
293
294 # Floating point pack and unpack operations.
295
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two 32-bit floats packed into one uint32.

    src0.x goes through pack_<fmt>_1x16 into the low 16 bits and src0.y
    into the high 16 bits.
    """
    # Every occurrence of "fmt" in the template is replaced by the format.
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
301
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four 32-bit floats packed into one uint32.

    Components x, y, z and w each go through pack_<fmt>_1x8 and occupy
    bits 0-7, 8-15, 16-23 and 24-31 respectively.
    """
    # Every occurrence of "fmt" in the template is replaced by the format.
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
309
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 unpacked into two 32-bit floats.

    dst.x is unpacked from the low 16 bits of src0.x and dst.y from the
    high 16 bits, each through unpack_<fmt>_1x16.
    """
    # The high half must be shifted *down* before the truncating cast; a
    # left shift by 16 followed by a uint16_t cast always yields 0.
    # Every occurrence of "fmt" in the template is replaced by the format.
    template = """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
"""
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
315
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 unpacked into four 32-bit floats.

    dst.x, dst.y, dst.z and dst.w are unpacked through unpack_<fmt>_1x8
    from bits 0-7, 8-15, 16-23 and 24-31 of src0.x respectively.
    """
    # Every occurrence of "fmt" in the template is replaced by the format.
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
323
324
# Instantiate the pack opcodes, then the unpack opcodes.  The normalized
# formats get both a 2x16 and a 4x8 variant; "half" only has 2x16.
for norm_fmt in ("snorm", "unorm"):
    pack_2x16(norm_fmt)
    pack_4x8(norm_fmt)
pack_2x16("half")
for norm_fmt in ("snorm", "unorm"):
    unpack_2x16(norm_fmt)
    unpack_4x8(norm_fmt)
unpack_2x16("half")
335
# Integer packing: the x component always lands in the least significant
# bits of the result and later components in successively higher bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
        (src0.y << 8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# The casts widen each component before it is shifted past its own width.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | "
           "((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
355
# Integer unpacking: every destination component is a slice of the single
# source component src0.x; the assignment to the narrower destination type
# drops the bits above the slice.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# The source has only one component, so the top 16 bits also have to come
# from src0.x (not src0.w).
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")
364
# dst.x comes from the low 16 bits of src0.x and dst.y from the high 16
# bits.  The high half must be shifted down before the truncating cast; a
# left shift by 16 followed by a uint16_t cast always yields 0.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
369
# Lowered floating point unpacking operations.
#
# The _x variants read the low 16 bits of the source, the _y variants the
# high 16 bits.
unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: _x is the low half of the source, _y the high half.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
387
# Bit operations, part of ARB_gpu_shader5.


# Bit i of the source becomes bit (31 - i) of the result.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")

# Number of set bits in the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the most significant set bit, or -1 if no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant: -1 if no matching bit is found.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 if no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
438
439
# fnoise<out>_<in> for every output and input size from 1 to 4; the
# constant expression of each is simply 0.0f.
for out_size in range(1, 5):
    for in_size in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(out_size, in_size),
                   out_size, tfloat, in_size, tfloat, "0.0f")
443
444
# AMD_gcn_shader extended instructions

# Face coordinates: "ma" is twice the component of largest magnitude; the
# two coordinates picked for that face are divided by ma and offset by 0.5.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# Face index: 0/1 for +x/-x, 2/3 for +y/-y, 4/5 for +z/-z.  The tests run
# in order, so on ties the last matching face wins.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
479
# Sum of vector components (registers fsum2, fsum3 and fsum4).
unop_reduce("fsum", 1, tfloat, tfloat,
            prereduce_expr="{src}",
            reduce_expr="{src0} + {src1}",
            final_expr="{src}")
482
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    The output has type out_type; all sizes are passed as 0 and
    is_conversion is False.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0, 0],
           input_types=[in_type, in_type], is_conversion=False,
           algebraic_properties=alg_props, const_expr=const_expr)
486
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and output all share ty."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
489
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool1."""
    binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
492
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool32."""
    binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
495
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types.

    Each source has its own size and type.  No algebraic properties are
    set and is_conversion is False.
    """
    opcode(name, output_size=out_size, output_type=out_type,
           input_sizes=[src1_size, src2_size],
           input_types=[src1_type, src2_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
500
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a two-source reduction.

    - prereduce_expr is formatted once per component with {src0} and {src1}
      bound to the matching components of the two sources
    - reduce_expr combines two partial results via {src0} and {src1}
    - final_expr is applied to the fully reduced value via {src}

    The variants are named name + "2", name + "3" and name + "4" and are
    all marked _2src_commutative.
    """
    # One parenthesised pre-reduced term for each of the x .. w components.
    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + comp,
                                    src1="src1." + comp) + ")"
        for comp in "xyzw"
    ]

    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    # The 4-component form is reduced pairwise: (x op y) op (z op w).
    reductions = (
        (2, combine(x, y)),
        (3, combine(combine(x, y), z)),
        (4, combine(combine(x, y), combine(z, w))),
    )
    for width, expr in reductions:
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], False,
               _2src_commutative, finish(expr))
522
# Addition and subtraction.
#
# The float variants take a dedicated path when the execution mode asks
# for round-towards-zero at this bit size: a _mesa_double_*_rtz helper for
# 64-bit values, otherwise the operation is done in double and converted
# to float with round-towards-zero.
binop("fadd", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")

# Saturating variants: on overflow the result is the largest or smallest
# value of the type (or 0 / the unsigned maximum) instead of wrapping.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
""")
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? "
      "MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
559
# Float multiply; takes a dedicated path when the execution mode asks for
# round-towards-zero at this bit size.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# low 32-bits of unsigned integer multiply
#
# Only the low half (bit_size / 2 bits) of each source takes part in the
# product.
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")


625
626
binop("fdiv", tfloat, "", "src0 / src1")
# Integer division by zero is folded to 0.
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative,
              "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
640
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =     (x & y)  + (x & ~y) +     (x & y)  + (~x & y)
#       = 2 * (x & y)  + (x & ~y) +                (~x & y)
#       =    ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              = (x & y) + ((x ^ y) >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")

# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           = ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
#
# Note the subtraction: e.g. rhadd(1, 2) = (1 + 2 + 1) >> 1 = 2, and
# (1 | 2) - ((1 ^ 2) >> 1) = 3 - 1 = 2.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
666
# Integer modulus/remainder by zero is folded to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod: when the C remainder is non-zero and the operands have different
# signs, src1 is added to the remainder.
binop("imod", tint, "",
      "src1 == 0 ? 0 : "
      "((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient down (floorf), frem rounds it towards zero
# (truncf).
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
683
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

# (name, source type, algebraic properties, constant expression)
comparison_table = (
    ("flt", tfloat, "", "src0 < src1"),
    ("fge", tfloat, "", "src0 >= src1"),
    ("feq", tfloat, _2src_commutative, "src0 == src1"),
    ("fne", tfloat, _2src_commutative, "src0 != src1"),
    ("ilt", tint, "", "src0 < src1"),
    ("ige", tint, "", "src0 >= src1"),
    ("ieq", tint, _2src_commutative, "src0 == src1"),
    ("ine", tint, _2src_commutative, "src0 != src1"),
    ("ult", tuint, "", "src0 < src1"),
    ("uge", tuint, "", "src0 >= src1"),
)

# Every comparison exists twice: once producing a bool1 and once, with a
# "32" suffix on the name, producing a bool32.
for cmp_name, cmp_type, cmp_props, cmp_expr in comparison_table:
    binop_compare(cmp_name, cmp_type, cmp_props, cmp_expr)
for cmp_name, cmp_type, cmp_props, cmp_expr in comparison_table:
    binop_compare32(cmp_name + "32", cmp_type, cmp_props, cmp_expr)
711
# integer-aware GLSL-style comparisons that compare floats and ints

# The "b" prefixed opcodes produce a bool1, the "b32" prefixed ones a
# bool32; apart from that the two families are identical.
for bool_prefix, bool_type in (("b", tbool1), ("b32", tbool32)):
    binop_reduce(bool_prefix + "all_fequal", 1, bool_type, tfloat,
                 "{src0} == {src1}", "{src0} && {src1}", "{src}")
    binop_reduce(bool_prefix + "any_fnequal", 1, bool_type, tfloat,
                 "{src0} != {src1}", "{src0} || {src1}", "{src}")
    binop_reduce(bool_prefix + "all_iequal", 1, bool_type, tint,
                 "{src0} == {src1}", "{src0} && {src1}", "{src}")
    binop_reduce(bool_prefix + "any_inequal", 1, bool_type, tint,
                 "{src0} != {src1}", "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32,
             prereduce_expr="{src0} == {src1}",
             reduce_expr="{src0} && {src1}",
             final_expr="{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32,
             prereduce_expr="{src0} != {src1}",
             reduce_expr="{src0} || {src1}",
             final_expr="{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

# NOTE(review): sge is declared on tfloat while slt/seq/sne use tfloat32;
# confirm whether that difference is intentional.
binop("slt", tfloat32, "",
      "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "",
      "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative,
      "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative,
      "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
746
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a uint32 and is masked with (bit width - 1)
# of the value being shifted.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Rotates: the bits shifted out on one side re-enter on the other.  The
# complementary shift uses the masked negated count.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 << (src1 & rotate_mask)) |
      (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 >> (src1 & rotate_mask)) |
      (src0 << (-src1 & rotate_mask));
""")
767
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


for logic_name, logic_operator in (("iand", "&"), ("ior", "|"), ("ixor", "^")):
    binop(logic_name, tuint, _2src_commutative + associative,
          "src0 " + logic_operator + " src1")


# Dot products: component-wise products summed up.  The _replicated
# variants are registered with an output size of 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat,
             prereduce_expr="{src0} * {src1}",
             reduce_expr="{src0} + {src1}",
             final_expr="{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             prereduce_expr="{src0} * {src1}",
             reduce_expr="{src0} + {src1}",
             final_expr="{src}")

# fdph: dot product of a 3-component src0 with the xyz of src1, plus
# src1.w.
fdph_expr = "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w"
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "", fdph_expr)
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       fdph_expr)
789
# Minimum and maximum.  The float variants call the double-precision libm
# functions; the integer variants are marked commutative and associative
# while the float variants carry no algebraic properties.
binop("fmin", tfloat, "", "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative,
      "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative,
      "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative,
      "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative,
      "src1 > src0 ? src1 : src0")
796
797 # Saturated vector add for 4 8bit ints.
798 binop("usadd_4x8", tint32, _2src_commutative + associative, """
799 dst = 0;
800 for (int i = 0; i < 32; i += 8) {
801 dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
802 }
803 """)
804
805 # Saturated vector subtract for 4 8bit ints.
806 binop("ussub_4x8", tint32, "", """
807 dst = 0;
808 for (int i = 0; i < 32; i += 8) {
809 int src0_chan = (src0 >> i) & 0xff;
810 int src1_chan = (src1 >> i) & 0xff;
811 if (src0_chan > src1_chan)
812 dst |= (src0_chan - src1_chan) << i;
813 }
814 """)
815
# Lane-wise min / max over four 8-bit lanes packed into a 32-bit value.
# The two folding snippets are identical apart from the MIN2/MAX2 macro,
# which is substituted into the shared template below.
_minmax_4x8_template = """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= %s((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
"""
for _lane_op_name, _lane_macro in (("umin_4x8", "MIN2"),
                                   ("umax_4x8", "MAX2")):
    binop(_lane_op_name, tint32, _2src_commutative + associative,
          _minmax_4x8_template % _lane_macro)
831
# Unorm multiply of four 8-bit lanes: each lane becomes (a * b) / 255
# using integer division.
_umul_unorm_4x8_fold = """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
"""
binop("umul_unorm_4x8", tint32, _2src_commutative + associative,
      _umul_unorm_4x8_fold)
841
# Floating-point power.
#
# Fold with the double-precision pow() at every bit size.  The previous
# expression was "bit_size == 64 ? powf(...) : pow(...)", i.e. the selector
# was inverted: 64-bit operands were pushed through the single-precision
# powf(), losing precision.  Every size other than 64 already took the
# pow() branch, so folding for those sizes is unchanged by this fix.
binop("fpow", tfloat, "", "pow(src0, src1)")
843
# Pack two 32-bit floats into one 32-bit word via pack_half_1x16():
# src0 lands in the low 16 bits, src1 in the high 16 bits.
_pack_half_split_fold = (
    "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            _pack_half_split_fold)

# Integer "split" packs: src0 supplies the low half of the result and src1,
# widened to the destination C type first, supplies the high half.
for _pack_name, _pack_dst_ty, _pack_src_ty, _pack_c_ty, _pack_shift in (
        ("pack_64_2x32_split", tuint64, tuint32, "uint64_t", 32),
        ("pack_32_2x16_split", tuint32, tuint16, "uint32_t", 16)):
    binop_convert(_pack_name, _pack_dst_ty, _pack_src_ty, "",
                  "src0 | ((%s)src1 << %d)" % (_pack_c_ty, _pack_shift))
852
# bfm matches the first operation of the SM5 "bfi" assembly and the i965
# "bfi1" instruction: only the low five bits of src0 (bit count) and of
# src1 (offset) are used, and the result is a mask of `bits` ones shifted
# left by `offset`.
_bfm_fold = """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
"""
binop_convert("bfm", tuint32, tint32, "", _bfm_fold)
861
# ldexp: float src0 scaled by a 32-bit integer exponent src1, using ldexp()
# for 64-bit and ldexpf() otherwise.  Any result that is not a normal
# number is replaced by a zero carrying src0's sign.
# NOTE(review): C's isnormal() is also false for infinities and NaN, so
# those results are turned into signed zero as well -- confirm intended.
_ldexp_fold = """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
"""
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", _ldexp_fold)
868
# vec2: build a 2-component vector from the first component of each of the
# two inputs.

_vec2_fold = "\n" + "".join(
    "dst.%s = src%d.x;\n" % (_chan, _src_idx)
    for _src_idx, _chan in enumerate("xy"))
binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, _vec2_fold)
875
# Byte (8-bit) and word (16-bit) extraction: shift src0 right by src1
# chunks and truncate through a C cast to the signed or unsigned type of
# that width.
for _extract_bits in (8, 16):
    for _extract_sign, _extract_ty, _extract_c_prefix in (("u", tuint, "u"),
                                                          ("i", tint, "")):
        binop("extract_%s%d" % (_extract_sign, _extract_bits),
              _extract_ty, "",
              "(%sint%d_t)(src0 >> (src1 * %d))"
              % (_extract_c_prefix, _extract_bits, _extract_bits))
883
884
def triop(name, ty, alg_props, const_expr):
    """Register a three-source opcode whose sources and result share `ty`.

    All sizes are passed to opcode() as 0 and the opcode is not flagged as
    a conversion; `alg_props` and `const_expr` are forwarded unchanged.
    """
    src_sizes = [0] * 3
    src_types = [ty] * 3
    opcode(name, 0, ty, src_sizes, src_types, False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source tuint opcode with explicit component sizes.

    The result and all three sources use tuint, no algebraic properties
    are attached, and the opcode is not flagged as a conversion.
    """
    src_sizes = [src1_size, src2_size, src3_size]
    src_types = [tuint] * len(src_sizes)
    opcode(name, output_size, tuint, src_sizes, src_types, False, "",
           const_expr)
891
# Fused multiply-add.  When nir_is_rounding_mode_rtz() holds for the current
# execution mode and bit size, folding goes through the _mesa_*_fma_rtz
# helpers (the non-32/64-bit case computes in double and then converts with
# _mesa_double_to_float_rtz).  Otherwise fmaf() handles 32-bit and fma()
# handles every other size.
_ffma_fold = """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_fma_rtz(src0, src1, src2);
else if (bit_size == 32)
dst = _mesa_float_fma_rtz(src0, src1, src2);
else
dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
if (bit_size == 32)
dst = fmaf(src0, src1, src2);
else
dst = fma(src0, src1, src2);
}
"""
triop("ffma", tfloat, _2src_commutative, _ffma_fold)

# Linear interpolation between src0 and src1, weighted by src2.
_flrp_fold = "src0 * (1 - src2) + src1 * src2"
triop("flrp", tfloat, "", _flrp_fold)
909
# Conditional select
#
# Per-component "?:" on vectors.  Selects come in a floating-point-bool
# flavour (0.0 vs 1.0) and an integer-bool flavour (0 vs ~0); fcsel is the
# float one and picks src1 whenever src0 compares unequal to 0.0f.

_fcsel_fold = "(src0 != 0.0f) ? src1 : src2"
triop("fcsel", tfloat32, "", _fcsel_fold)
918
# 3 way min/max/med
#
# The float variants are declared on tfloat rather than a fixed-width float
# type, so they are folded with the double-precision fmin()/fmax(), the same
# functions the two-source fmin/fmax opcodes use.  The previous
# fminf()/fmaxf() calls squeezed 64-bit operands through single precision.
# Picking one of the operands is exact, so results for narrower sizes are
# not expected to change.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
931
# Boolean-conditioned selects: src0 is the condition (tbool1 for bcsel,
# tbool32 for b32csel) and chooses between the tuint operands src1 and src2.
_csel_fold = "src0 ? src1 : src2"
for _csel_name, _csel_cond_ty in (("bcsel", tbool1), ("b32csel", tbool32)):
    opcode(_csel_name, 0, tuint, [0, 0, 0],
           [_csel_cond_ty, tuint, tuint], False, "", _csel_fold)
936
# SM5 bfi assembly: src0 is the mask, src1 the value to insert and src2 the
# base.  The insert value is shifted left once for every trailing zero bit
# of the mask before being merged into the base under that mask; a zero
# mask returns the base untouched.
_bfi_fold = """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
unsigned tmp = mask;
while (!(tmp & 1)) {
tmp >>= 1;
insert <<= 1;
}
dst = (base & ~mask) | (insert & mask);
}
"""
triop("bfi", tuint32, "", _bfi_fold)


# Bitwise select: take src1's bits where src0 is set and src2's elsewhere.
_bitfield_select_fold = "(src0 & src1) | (~src0 & src2)"
triop("bitfield_select", tuint, "", _bitfield_select_fold)
954
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and
# bits are used.  The unsigned and signed forms share one folding snippet;
# the only difference is the C type of `base`, which decides whether the
# right shifts are performed on an unsigned or a signed value.
_bfe_template = """
%s base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
dst = 0;
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
"""
for _bfe_name, _bfe_ty, _bfe_c_ty in (("ubfe", tuint32, "unsigned"),
                                      ("ibfe", tint32, "int")):
    opcode(_bfe_name, 0, _bfe_ty,
           [0, 0, 0], [_bfe_ty, tuint32, tuint32], False, "",
           _bfe_template % _bfe_c_ty)
982
# GLSL bitfieldExtract(), unsigned form.  A zero bit count yields 0, and so
# does any negative or out-of-range offset/bits pair (the snippet marks
# that case as undefined per the spec).  The mask is built from a 64-bit
# constant so that bits == 32 is handled.
_ubitfield_extract_fold = """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1ull << bits) - 1);
}
"""
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "",
       _ubitfield_extract_fold)
# Signed bitfield extract: returns `bits` bits of src0 starting at `offset`,
# sign-extended from the top bit of the field.  A zero bit count, or a
# negative / out-of-range offset and bits pair, yields 0.
#
# The field is first shifted left so that its top bit sits in bit 31, and
# then shifted right by (32 - bits) so that the field ends at bit 0.  The
# earlier code shifted right by `offset` instead, which only lands the field
# at bit 0 when offset == 32 - bits; e.g. base = 0xF0, offset = 4, bits = 4
# produced 0xFF000000 rather than -1.  The shift now matches the one used by
# the ibfe opcode.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
dst = 0;
} else {
dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1008
# vec3: build a 3-component vector from the first component of each of the
# three inputs.

_vec3_fold = "\n" + "".join(
    "dst.%s = src%d.x;\n" % (_chan, _src_idx)
    for _src_idx, _chan in enumerate("xyz"))
triop_horiz("vec3", 3, 1, 1, 1, _vec3_fold)
1016
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source tuint opcode with explicit component sizes.

    The result and all four sources use tuint, no algebraic properties are
    attached, and the opcode is not flagged as a conversion.
    """
    src_sizes = [src1_size, src2_size, src3_size, src4_size]
    src_types = [tuint] * len(src_sizes)
    opcode(name, output_size, tuint, src_sizes, src_types, False, "",
           const_expr)
1023
# Bitfield insert: replace `bits` bits of src0 (base), starting at `offset`,
# with the low bits of src1 (insert).  A zero bit count returns the base
# unchanged; a negative or out-of-range offset/bits pair yields 0.  The mask
# is built from a 64-bit constant so that bits == 32 is handled.
_bitfield_insert_fold = """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
dst = 0;
} else {
unsigned mask = ((1ull << bits) - 1) << offset;
dst = (base & ~mask) | ((insert << offset) & mask);
}
"""
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "",
       _bitfield_insert_fold)
1037
# vec4: build a 4-component vector from the first component of each of the
# four inputs.
_vec4_fold = "\n" + "".join(
    "dst.%s = src%d.x;\n" % (_chan, _src_idx)
    for _src_idx, _chan in enumerate("xyzw"))
quadop_horiz("vec4", 4, 1, 1, 1, 1, _vec4_fold)
1044
# ir3-specific instruction that maps directly onto the "mul-add shift high
# mix" operation (IMADSH_MIX16, i.e. ah * bl << 16 + c): the high 16 bits of
# src0 times the low 16 bits of src1, shifted left by 16, plus src2.  It is
# used when lowering integer multiplication (imul) on the Freedreno backend.
_imadsh_mix16_fold = """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
"""
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "",
       _imadsh_mix16_fold)