nir: Add a new ALU nir_op_imad24_ir3
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Everything the generators know about a single NIR ALU opcode.

    NOTE: this must be kept in sync with nir_op_info
    """

    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Check the arguments and store them as attributes.

        Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - output_size and every entry of input_sizes must lie in 0..4; when
          output_size is non-zero, no input size may be zero
        - all types are strings that get nir_type_ prepended to them
        - input_sizes and input_types are lists of equal length, one entry
          per source
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_
          is prepended before each entry
        - const_expr is an expression or series of statements that computes
          the constant value of the opcode given the constant values of its
          inputs.

        Constant expressions are written in terms of the variables src0,
        src1, ..., src(N-1), where N is the number of arguments, and store
        their result in the variable dst.  Per-component inputs and outputs
        are scalars; non-per-component inputs and outputs are structs with
        fields named x, y, z, and w, all of the correct type.  Inputs and
        outputs can be assumed to already have the correct type and need no
        conversion.  In particular, the conversion from the C bool type
        to/from NIR_TRUE and NIR_FALSE happens automatically.

        For per-component instructions, the entire expression is executed
        once for each component.  For non-per-component instructions, the
        expression is expected to store the correct values in dst.x, dst.y,
        etc.  If "dst" does not appear anywhere in the constant expression,
        an assignment to dst happens automatically: the result is equivalent
        to "dst = <expression>" for per-component instructions and to
        "dst.x = dst.y = ... = <expression>" for non-per-component
        instructions.
        """
        # Type checks, performed in a fixed order.  Only the first element of
        # each list is inspected.
        for value, kind in ((name, str), (output_size, int),
                            (output_type, str), (input_sizes, list)):
            assert isinstance(value, kind)
        assert isinstance(input_sizes[0], int)
        assert isinstance(input_types, list)
        assert isinstance(input_types[0], str)
        for value, kind in ((is_conversion, bool),
                            (algebraic_properties, str), (const_expr, str)):
            assert isinstance(value, kind)

        # Consistency checks on the sizes.
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        assert all(0 <= size <= 4 and (output_size == 0 or size != 0)
                   for size in input_sizes)

        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
92
# Shorthand variables for the NIR type strings used by the opcode definitions.

# Types without an explicit bit size.
tfloat = "float"
tint = "int"
tuint = "uint"
tbool = "bool"

# Types with a fixed bit size.
tbool1 = "bool1"
tbool32 = "bool32"
tuint16 = "uint16"
tint32 = "int32"
tuint32 = "uint32"
tfloat32 = "float32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
107
# Splits a NIR type string into a base type and an optional bit size.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def _parse_type(type_):
    """Match type_ against _TYPE_SPLIT_RE, asserting that it is valid."""
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return parsed


def type_has_size(type_):
    """Return True if the type string carries an explicit bit size."""
    return _parse_type(type_).group('bits') is not None


def type_size(type_):
    """Return the bit size of a sized type string as an int.

    Asserts that the string is a valid type and that it has a bit size.
    """
    bits = _parse_type(type_).group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)


def type_sizes(type_):
    """Return the list of bit sizes the given type string can take.

    A sized type yields a single-element list holding its own size.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    # Unsized types: bool and float are special, everything else is integer.
    special_cases = {'bool': [1, 32], 'float': [16, 32, 64]}
    return special_cases.get(type_, [1, 8, 16, 32, 64])


def type_base_type(type_):
    """Return the base type (int, uint, float or bool) of a type string."""
    return _parse_type(type_).group('type')
136
137 # Operation where the first two sources are commutative.
138 #
139 # For 2-source operations, this just mathematical commutativity. Some
140 # 3-source operations, like ffma, are only commutative in the first two
141 # sources.
142 _2src_commutative = "2src_commutative "
143 associative = "associative "
144
145 # global dictionary of opcodes
146 opcodes = {}
147
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Build an Opcode from the arguments and add it to ``opcodes``.

    The arguments are forwarded unchanged to the Opcode constructor.  Each
    name may only be registered once; a duplicate trips the assertion.
    """
    assert name not in opcodes
    description = Opcode(name, output_size, output_type, input_sizes,
                         input_types, is_conversion, algebraic_properties,
                         const_expr)
    opcodes[name] = description
154
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode with separate output and input types.

    Output and input sizes are both 0, the opcode is not flagged as a
    conversion and it has no algebraic properties.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
157
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type ty."""
    opcode(name, output_size=0, output_type=ty, input_sizes=[0],
           input_types=[ty], is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
160
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes.

    The opcode is not flagged as a conversion and has no algebraic
    properties.
    """
    opcode(name, output_size=output_size, output_type=output_type,
           input_sizes=[input_size], input_types=[input_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
165
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the "2", "3" and "4" variants of a reduction over one vector.

    - prereduce_expr is applied to each component; "{src}" is replaced by
      the component (src0.x, src0.y, ...) and the result is parenthesized.
    - reduce_expr combines two partial results, "{src0}" and "{src1}".
    - final_expr receives the fully reduced expression, wrapped in
      parentheses, as "{src}".

    The variant named name + "N" takes an N-component source.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = ["(" + prereduce_expr.format(src="src0." + comp) + ")"
                  for comp in "xyzw"]

    variants = [
        (2, combine(x, y)),
        (3, combine(combine(x, y), z)),
        (4, combine(combine(x, y), combine(z, w))),
    ]
    for width, reduced in variants:
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, finish(reduced))
184
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion."""
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=True,
           algebraic_properties="", const_expr=const_expr)
187
# Basic unary operations, registered in table order.  Each entry is
# (name, type, constant expression).
_basic_unops = [
    ("mov", tuint, "src0"),

    ("ineg", tint, "-src0"),
    ("fneg", tfloat, "-src0"),
    ("inot", tint, "~src0"),  # invert every bit of the integer
    ("fsign", tfloat,
     "bit_size == 64 ? "
     "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
     "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"),
    ("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)"),
    ("iabs", tint, "(src0 < 0) ? -src0 : src0"),
    ("fabs", tfloat, "fabs(src0)"),
    ("fsat", tfloat,
     "bit_size == 64 ? "
     "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : "
     "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"),
    ("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0"),
    ("frsq", tfloat,
     "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)"),
    ("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)"),
    ("fexp2", tfloat, "exp2f(src0)"),
    ("flog2", tfloat, "log2f(src0)"),
]
for _unop_name, _unop_type, _unop_expr in _basic_unops:
    unop(_unop_name, _unop_type, _unop_expr)
205 unop("fexp2", tfloat, "exp2f(src0)")
206 unop("flog2", tfloat, "log2f(src0)")
207
# Generate all of the numeric conversion opcodes
#
# Opcode names are built as <src initial>2<dst initial><dst bit size>, e.g.
# "f2i32"; float-to-16-bit-float conversions additionally get "_rtne" and
# "_rtz" variants next to the unsuffixed one.
for src_t in [tint, tuint, tfloat, tbool]:
    # Destination base types that are generated for each source base type.
    if src_t == tbool:
        dst_types = [tfloat, tint]
    elif src_t == tint:
        dst_types = [tfloat, tint, tbool]
    elif src_t == tuint:
        dst_types = [tfloat, tuint]
    elif src_t == tfloat:
        dst_types = [tint, tuint, tfloat, tbool]

    for dst_t in dst_types:
        for dst_bit_size in type_sizes(dst_t):
            if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
                # float -> float16: one opcode per rounding-mode suffix.
                rnd_modes = ['_rtne', '_rtz', '']
                for rnd_mode in rnd_modes:
                    if rnd_mode == '_rtne':
                        conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
} else {
dst = src0;
}
"""
                    elif rnd_mode == '_rtz':
                        conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
} else {
dst = src0;
}
"""
                    else:
                        conv_expr = "src0"

                    unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                                dst_t[0],
                                                                dst_bit_size,
                                                                rnd_mode),
                                         dst_t + str(dst_bit_size),
                                         src_t, conv_expr)
            elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
                # float -> float32: a wider source is converted with the
                # round-toward-zero helper when the execution mode asks for
                # it.
                conv_expr = """
if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
dst = _mesa_double_to_float_rtz(src0);
} else {
dst = src0;
}
"""
                unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                         dst_bit_size),
                                     dst_t + str(dst_bit_size), src_t, conv_expr)
            else:
                # Everything else: conversions to bool test for non-zero,
                # the rest just use the source value.
                conv_expr = "src0 != 0" if dst_t == tbool else "src0"
                unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                         dst_bit_size),
                                     dst_t + str(dst_bit_size), src_t, conv_expr)
265
266
# Unary floating-point rounding operations.
#
# Each expression picks the double-precision C function for 64-bit values
# and the single-precision one otherwise.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values whose magnitude is below 2^-14 become a zero with the sign of the
# source; everything else is round-tripped through half precision.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp stores the exponent written by frexp(); frexp_sig stores the
# value frexp() returns.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
287
# Partial derivatives.
#
# Every variant folds to 0.0: the derivative of a constant is 0.
for _derivative in ("fddx", "fddy",
                    "fddx_fine", "fddy_fine",
                    "fddx_coarse", "fddy_coarse"):
    unop(_derivative, tfloat, "0.0")

297
298
299 # Floating point pack and unpack operations.
300
def pack_2x16(fmt):
    """Register "pack_<fmt>_2x16": two float32 components into one uint32.

    Component x lands in the low 16 bits and component y in the high 16
    bits.  Every "fmt" in the expression template is replaced by the format
    name, selecting the pack_<fmt>_1x16 helper.
    """
    opcode_name = "pack_" + fmt + "_2x16"
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    unop_horiz(opcode_name, 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
306
def pack_4x8(fmt):
    """Register "pack_<fmt>_4x8": four float32 components into one uint32.

    Components x, y, z and w are placed at bit offsets 0, 8, 16 and 24.
    Every "fmt" in the expression template is replaced by the format name,
    selecting the pack_<fmt>_1x8 helper.
    """
    opcode_name = "pack_" + fmt + "_4x8"
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    unop_horiz(opcode_name, 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
314
def unpack_2x16(fmt):
    """Register "unpack_<fmt>_2x16": one uint32 into two float32 components.

    dst.x is unpacked from the low 16 bits of the source and dst.y from the
    high 16 bits.  Every "fmt" in the expression template is replaced by the
    format name, selecting the unpack_<fmt>_1x16 helper.
    """
    # The high half must be shifted *right*: a left shift by 16 followed by
    # the uint16_t cast always yields 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
320
def unpack_4x8(fmt):
    """Register "unpack_<fmt>_4x8": one uint32 into four float32 components.

    Components x, y, z and w are unpacked from the bytes at bit offsets 0,
    8, 16 and 24 of the source.  Every "fmt" in the expression template is
    replaced by the format name, selecting the unpack_<fmt>_1x8 helper.
    """
    opcode_name = "unpack_" + fmt + "_4x8"
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    unop_horiz(opcode_name, 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
328
329
# Register the pack opcodes, then the unpack opcodes.  snorm and unorm
# exist in both 2x16 and 4x8 flavours; half only exists as 2x16.
for _norm_fmt in ("snorm", "unorm"):
    pack_2x16(_norm_fmt)
    pack_4x8(_norm_fmt)
pack_2x16("half")

for _norm_fmt in ("snorm", "unorm"):
    unpack_2x16(_norm_fmt)
    unpack_4x8(_norm_fmt)
unpack_2x16("half")
340
# Pack the low 16 bits of x together with y shifted into the high half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# OR the four components together at bit offsets 0, 8, 16 and 24.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# pack_<bits>_<N>x<M>: combine N M-bit components into one value, with
# component x in the lowest bits.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Split one 64-bit value into two 32-bit components, low half first.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
363
# Split one 64-bit value into four 16-bit components, lowest bits first.
# The source has a single component, so every destination component
# (including w) is taken from src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
366
# Split one 32-bit value into two 16-bit components, low half first.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")
369
# Unpack two halves from one uint32 using the flush-to-zero helper: dst.x
# comes from the low 16 bits, dst.y from the high 16 bits.  The high half
# must be shifted *right*: a left shift by 16 followed by the uint16_t cast
# always yields 0.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
374
# Lowered floating point unpacking operations.
#
# The _x variants read the low half of the source and the _y variants read
# the high half.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: the result type is half as wide as the source type.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
392
# Bit operations, part of ARB_gpu_shader5.


# Mirror the 32 bits of the source: bit N moves to bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Count the bits that are set in the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")

# Index of the highest set bit, or -1 if no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")

# Signed variant: result is -1 if no matching bit is found.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the lowest set bit, or -1 if no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
443
444
# fnoise<i>_<j>: i output components from j input components, for every
# combination of 1..4.  The constant expression is always 0.0f.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")


# AMD_gcn_shader extended instructions
#
# cube_face_coord: ma is twice the component with the largest magnitude;
# the two coordinates chosen for that face are divided by ma and offset by
# 0.5.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# cube_face_index: 0/1 for +x/-x, 2/3 for +y/-y, 4/5 for +z/-z, selected by
# the component with the largest magnitude.  The checks are not exclusive,
# so on ties the last matching line wins.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
487
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share one type.

    Both sources have type in_type and the result has type out_type.  All
    sizes are 0 and the opcode is not flagged as a conversion.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0, 0],
           input_types=[in_type, in_type], is_conversion=False,
           algebraic_properties=alg_props, const_expr=const_expr)
491
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result share type ty."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
494
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty with a bool1 result."""
    binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
497
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on type ty with a bool32 result."""
    binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
500
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types.

    The opcode is not flagged as a conversion and has no algebraic
    properties.
    """
    opcode(name, output_size=out_size, output_type=out_type,
           input_sizes=[src1_size, src2_size],
           input_types=[src1_type, src2_type], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
505
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the "2", "3" and "4" variants of a reduction over two vectors.

    - prereduce_expr is applied to each pair of matching components;
      "{src0}" and "{src1}" are replaced by e.g. src0.x and src1.x and the
      result is parenthesized.
    - reduce_expr combines two partial results, "{src0}" and "{src1}".
    - final_expr receives the fully reduced expression, wrapped in
      parentheses, as "{src}".

    The variant named name + "N" takes two N-component sources of type
    src_type and is marked as commutative in its two sources.
    """
    def pair(component):
        return "(" + prereduce_expr.format(src0="src0." + component,
                                           src1="src1." + component) + ")"

    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [pair(component) for component in "xyzw"]

    variants = [
        (2, combine(x, y)),
        (3, combine(combine(x, y), z)),
        (4, combine(combine(x, y), combine(z, w))),
    ]
    for width, reduced in variants:
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], False,
               _2src_commutative, finish(reduced))
527
# Floating-point add.  With round-toward-zero selected for this bit size,
# 64-bit values go through the dedicated helper and narrower ones are added
# as doubles and converted back with the round-toward-zero helper.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_add_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add: the overflow branches yield
# (1 << (bit_size - 1)) - 1 or 1 << (bit_size - 1) instead of wrapping.
binop("iadd_sat", tint, _2src_commutative, """
src1 > 0 ?
(src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
(src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
""")
# Saturating unsigned add: a wrapped sum is replaced by the maximum value.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract, mirroring iadd_sat.
binop("isub_sat", tint, "", """
src1 < 0 ?
(src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
(src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
""")
# Saturating unsigned subtract: never goes below zero.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Floating-point subtract, with the same rounding-mode handling as fadd.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_sub_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
564
# Floating-point multiply, with the same rounding-mode handling as fadd.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_mul_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high bits of signed integer multiply: the product shifted right by
# bit_size (64-bit sources use a multi-word multiply instead)
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
/* We need to do a full 128-bit x 128-bit multiply in order for the sign
* extension to work properly. The casts are kind-of annoying but needed
* to prevent compiler warnings.
*/
uint32_t src0_u32[4] = {
src0,
(int64_t)src0 >> 32,
(int64_t)src0 >> 63,
(int64_t)src0 >> 63,
};
uint32_t src1_u32[4] = {
src1,
(int64_t)src1 >> 32,
(int64_t)src1 >> 63,
(int64_t)src1 >> 63,
};
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high bits of unsigned integer multiply: the product shifted right by
# bit_size (64-bit sources use a multi-word multiply instead)
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
/* The casts are kind-of annoying but needed to prevent compiler warnings. */
uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# unsigned multiply of the low halves of the sources: each source is masked
# to its low bit_size / 2 bits before multiplying
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")
630
631
# Division.  The integer variants fold division by zero to 0.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.  The result type is uint, not bool.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.  The result type is uint, not bool.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
645
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       = (x & y) + (x & ~y) + (x & y) + (~x & y)
#       = 2 * (x & y) + (x & ~y) + (~x & y)
#       = ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              = (x & y) + ((x ^ y) >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
658
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
#           = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
#           = ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
#
# Note the subtraction: e.g. for x = 3, y = 0 the result must be
# (3 + 0 + 1) >> 1 = 2 = (3 | 0) - ((3 ^ 0) >> 1).
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
671
# Unsigned modulus; a zero divisor folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# The float variants differ only in how the quotient is rounded: floorf for
# fmod, truncf for frem.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
688
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
#
# Each comparison is registered twice: first all of them under their plain
# names with a bool1 result, then all of them again with a "32" suffix and a
# bool32 result.  Entries are (name, source type, properties, expression).
_comparisons = [
    ("flt", tfloat, "", "src0 < src1"),
    ("fge", tfloat, "", "src0 >= src1"),
    ("feq", tfloat, _2src_commutative, "src0 == src1"),
    ("fne", tfloat, _2src_commutative, "src0 != src1"),
    ("ilt", tint, "", "src0 < src1"),
    ("ige", tint, "", "src0 >= src1"),
    ("ieq", tint, _2src_commutative, "src0 == src1"),
    ("ine", tint, _2src_commutative, "src0 != src1"),
    ("ult", tuint, "", "src0 < src1"),
    ("uge", tuint, "", "src0 >= src1"),
]
for _cmp_suffix, _cmp_register in (("", binop_compare),
                                   ("32", binop_compare32)):
    for _cmp_name, _cmp_type, _cmp_props, _cmp_expr in _comparisons:
        _cmp_register(_cmp_name + _cmp_suffix, _cmp_type, _cmp_props,
                      _cmp_expr)
716
# integer-aware GLSL-style comparisons that compare floats and ints
#
# The "b" opcodes produce a bool1 result and the "b32" opcodes a bool32
# result; apart from that the two families are identical.
for _bool_prefix, _bool_type in (("b", tbool1), ("b32", tbool32)):
    binop_reduce(_bool_prefix + "all_fequal", 1, _bool_type, tfloat,
                 "{src0} == {src1}", "{src0} && {src1}", "{src}")
    binop_reduce(_bool_prefix + "any_fnequal", 1, _bool_type, tfloat,
                 "{src0} != {src1}", "{src0} || {src1}", "{src}")
    binop_reduce(_bool_prefix + "all_iequal", 1, _bool_type, tint,
                 "{src0} == {src1}", "{src0} && {src1}", "{src}")
    binop_reduce(_bool_prefix + "any_inequal", 1, _bool_type, tint,
                 "{src0} != {src1}", "{src0} || {src1}", "{src}")

736
# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with the unsized float type while slt, seq
# and sne use float32 -- confirm whether that difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal

751
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a uint32 and is masked with the bit width of
# src0 minus one.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Rotates: shift one way by the masked count and the other way by the
# masked negated count, then OR the two halves together.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 << (src1 & rotate_mask)) |
(src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 >> (src1 & rotate_mask)) |
(src0 << (-src1 & rotate_mask));
""")
772
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")


# Dot products: multiply matching components and add the products.  The
# plain variant has a 1-component result, the _replicated one 4 components.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: 3-component dot product of src0 with src1.xyz, plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.  The float variants call the C fmin()/fmax()
# functions and declare no algebraic properties.
binop("fmin", tfloat, "", "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
801
# Operations on four 8-bit lanes packed into one 32-bit value.  Every snippet
# walks the lanes with the same shift-by-i / mask-with-0xff loop.
_lane_commutative = _2src_commutative + associative
for _lane_name, _lane_props, _lane_body in (
    # Saturated vector add for 4 8bit ints.
    ("usadd_4x8", _lane_commutative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
"""),
    # Saturated vector subtract for 4 8bit ints.
    ("ussub_4x8", "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
"""),
    # Vector min for 4 8bit ints.
    ("umin_4x8", _lane_commutative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
"""),
    # Vector max for 4 8bit ints.
    ("umax_4x8", _lane_commutative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
"""),
):
    binop(_lane_name, tint32, _lane_props, _lane_body)
836
# unorm multiply: (a * b) / 255, applied to each of the four 8-bit lanes
# packed into the 32-bit operands.
#
# Only commutativity is advertised.  The per-lane division truncates, so the
# operation is not associative: for lanes a = 200, b = 200, c = 2,
# ((a * b) / 255 * c) / 255 evaluates to 1 but (a * ((b * c) / 255)) / 255
# evaluates to 0.
binop("umul_unorm_4x8", tint32, _2src_commutative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
846
# Exponentiation.  Use the libm routine that matches the operand width:
# double-precision pow() when bit_size is 64 and single-precision powf()
# otherwise, so 64-bit operands are not narrowed to float.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
848
# Pack two 32-bit floats as half floats: src0 lands in the low 16 bits of the
# result and src1 in the high 16 bits.
_pack_half_expr = "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)"
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            _pack_half_expr)

# Pack two integers into one of twice the width: src0 supplies the low half
# and src1 the high half.
for _split_name, _split_dst, _split_src, _split_expr in (
    ("pack_64_2x32_split", tuint64, tuint32, "src0 | ((uint64_t)src1 << 32)"),
    ("pack_32_2x16_split", tuint32, tuint16, "src0 | ((uint32_t)src1 << 16)"),
):
    binop_convert(_split_name, _split_dst, _split_src, "", _split_expr)
857
# bfm mirrors the first operation of the SM5 "bfi" assembly instruction and
# the i965 "bfi1" instruction: the field width ("bits") is the low five bits
# of src0 and the field position ("offset") is the low five bits of src1.
_bfm_body = """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
"""
binop_convert("bfm", tuint32, tint32, "", _bfm_body)
866
# ldexp: float src0 combined with a 32-bit integer src1 through the C
# library's ldexp() (64-bit) or ldexpf() (other sizes).  A result that is not
# a normal number is replaced by a zero carrying the sign of src0.
_ldexp_body = """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
"""
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", _ldexp_body)
873
# vec2: build a 2-component vector from the first component of each source.
_vec2_body = """
dst.x = src0.x;
dst.y = src1.x;
"""
binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, _vec2_body)
880
# Byte and word extraction.  src1 selects which 8-bit (or 16-bit) slice of
# src0 is returned; the C cast decides between zero and sign extension.
for _ext_name, _ext_type, _ext_expr in (
    # Byte extraction
    ("extract_u8", tuint, "(uint8_t)(src0 >> (src1 * 8))"),
    ("extract_i8", tint, "(int8_t)(src0 >> (src1 * 8))"),
    # Word extraction
    ("extract_u16", tuint, "(uint16_t)(src0 >> (src1 * 16))"),
    ("extract_i16", tint, "(int16_t)(src0 >> (src1 * 16))"),
):
    binop(_ext_name, _ext_type, "", _ext_expr)
888
889
def triop(name, ty, alg_props, const_expr):
    """Declare a three-source opcode whose sources and result all use ty.

    Forwards to opcode() with 0 as the output size and as each of the three
    input sizes, is_conversion set to False, and alg_props and const_expr
    passed through unchanged.
    """
    source_count = 3
    opcode(name, 0, ty, [0] * source_count, [ty] * source_count,
           False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Declare a three-source tuint opcode with explicit operand sizes.

    Forwards to opcode() with tuint as the output type and as the type of
    all three sources, no algebraic properties, and is_conversion set to
    False.
    """
    source_sizes = [src1_size, src2_size, src3_size]
    source_types = [tuint for _ in source_sizes]
    opcode(name, output_size, tuint, source_sizes, source_types,
           False, "", const_expr)
896
# Fused multiply-add.  When the execution mode asks for round-toward-zero the
# snippet uses the _mesa_*_fma_rtz helpers (going through double and
# converting back for sizes other than 32 and 64); otherwise it calls the C
# library's fmaf() for 32-bit values and fma() for everything else.
_ffma_body = """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
"""
triop("ffma", tfloat, _2src_commutative, _ffma_body)

# Linear interpolation between src0 and src1, weighted by src2.
_flrp_expr = "src0 * (1 - src2) + src1 * src2"
triop("flrp", tfloat, "", _flrp_expr)
914
# Conditional select.
#
# Behaves like the C "?:" operator applied per component of a vector.  This
# is the variant for floating-point booleans (0.0 vs 1.0): src1 is chosen
# where src0 is non-zero and src2 otherwise.  Variants for integer booleans
# (0 vs ~0) also exist.
_fcsel_expr = "(src0 != 0.0f) ? src1 : src2"
triop("fcsel", tfloat32, "", _fcsel_expr)
923
# 3 way min/max/med
#
# The float variants call the double-precision fmin()/fmax(), as the
# two-operand fmin/fmax opcodes do.  The single-precision fminf()/fmaxf()
# would narrow 64-bit operands to float before comparing them; widening
# smaller floats to double and back is exact, so those sizes are unaffected.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
936
# Boolean-controlled select: src1 where src0 is true, src2 otherwise.  The
# two opcodes differ only in the boolean type accepted for src0.
for _csel_name, _csel_bool in (("bcsel", tbool1), ("b32csel", tbool32)):
    opcode(_csel_name, 0, tuint, [0, 0, 0],
           [_csel_bool, tuint, tuint], False, "", "src0 ? src1 : src2")
941
# SM5 bfi assembly.  src0 is the mask, src1 the value to insert and src2 the
# base.  The insert value is shifted left until it lines up with the lowest
# set bit of the mask, then merged into the base under the mask.  A zero mask
# returns the base unchanged.
_bfi_body = """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
"""
triop("bfi", tuint32, "", _bfi_body)


# Bitwise select: take the bits of src1 where src0 is set and the bits of
# src2 where it is clear.
_bitfield_select_expr = "(src0 & src1) | (~src0 & src2)"
triop("bitfield_select", tuint, "", _bitfield_select_expr)
959
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and
# bits are used.  The two snippets are identical except for the C type of
# "base" (unsigned for ubfe, int for ibfe), which decides whether the right
# shifts zero-fill or sign-extend.
_bfe_template = """
%s base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
"""
for _bfe_name, _bfe_type, _bfe_ctype in (
    ("ubfe", tuint32, "unsigned"),
    ("ibfe", tint32, "int"),
):
    opcode(_bfe_name, 0, _bfe_type,
           [0, 0, 0], [_bfe_type, tuint32, tuint32], False, "",
           _bfe_template % _bfe_ctype)
987
# GLSL bitfieldExtract(), unsigned form: returns "bits" bits of src0 starting
# at bit "offset".  A zero width yields 0, and so do the out-of-range
# combinations that the spec leaves undefined.
_ubitfield_extract_body = """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
"""
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "",
       _ubitfield_extract_body)
# GLSL bitfieldExtract(), signed form: returns "bits" bits of src0 starting
# at bit "offset", sign-extended from the top bit of the field.  A zero width
# or an out-of-range offset/bits combination yields 0.
#
# The left shift moves the top bit of the field to bit 31; the arithmetic
# right shift must then be by (32 - bits) so that the field ends up at bit 0.
# For example base = 0xF0, offset = 4, bits = 4 gives -1.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1013
# vec3: build a 3-component vector from the first component of each source.
_vec3_body = """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
"""
triop_horiz("vec3", 3, 1, 1, 1, _vec3_body)
1021
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Declare a four-source tuint opcode with explicit operand sizes.

    Forwards to opcode() with tuint as the output type and as the type of
    all four sources, no algebraic properties, and is_conversion set to
    False.
    """
    source_sizes = [src1_size, src2_size, src3_size, src4_size]
    source_types = [tuint for _ in source_sizes]
    opcode(name, output_size, tuint, source_sizes, source_types,
           False, "", const_expr)
1028
# bitfield_insert: replace "bits" bits of src0 (the base), starting at bit
# "offset", with the low bits of src1 (the insert value).  src2 is the offset
# and src3 the width.  A zero width returns the base unchanged; a negative or
# out-of-range offset/width yields 0.
_bitfield_insert_body = """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
"""
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "",
       _bitfield_insert_body)
1042
# vec4: build a 4-component vector from the first component of each source.
_vec4_body = """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
"""
quadop_horiz("vec4", 4, 1, 1, 1, 1, _vec4_body)
1049
# ir3-specific instruction that maps directly to the "mul-add shift high mix"
# instruction IMADSH_MIX16, i.e. ((ah * bl) << 16) + c, where ah is the high
# 16 bits of src0, bl the low 16 bits of src1 and c is src2.  It is used for
# lowering integer multiplication (imul) on the Freedreno backend.
_imadsh_mix16_body = """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
"""
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "",
       _imadsh_mix16_body)
1057
# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# src0 and src1 are each sign-extended from their low 24 bits (shift left by
# 8, then arithmetic shift right by 8), multiplied into a 32-bit result, and
# the 32-bit integer src2 is added.
_imad24_expr = "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2"
triop("imad24_ir3", tint32, _2src_commutative, _imad24_expr)
1063