nir: Add a 16-bit bool type
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        output_size and every entry of input_sizes must lie in 0..4, and when
        output_size is non-zero no input size may be zero.  input_sizes and
        input_types must have the same length; every element of both lists is
        type-checked (not only the first one).

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        # Check every element; checking only index 0 let malformed later
        # entries through and raised IndexError on an empty list.
        assert all(isinstance(size, int) for size in input_sizes)
        assert isinstance(input_types, list)
        assert all(isinstance(type_, str) for type_ in input_types)
        assert isinstance(is_conversion, bool)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
92
# Shorthand variables for the NIR type strings used by the opcode
# definitions in this file.

# Types without an explicit bit size.
tfloat = "float"
tint = "int"
tuint = "uint"
tbool = "bool"

# Boolean types with an explicit bit size.
tbool1 = "bool1"
tbool16 = "bool16"
tbool32 = "bool32"

# Integer and floating-point types with an explicit bit size.
tuint16 = "uint16"
tint32 = "int32"
tuint32 = "uint32"
tfloat32 = "float32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
108
# Splits a NIR type string into a base type ("type" group) and an optional
# run of digits giving the bit size ("bits" group).
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def type_has_size(type_):
    """Return True if the NIR type string carries an explicit bit size.

    Asserts that type_ starts with one of the known base types.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
115
def type_size(type_):
    """Return the bit size encoded in a sized NIR type string, as an int.

    Asserts that type_ is a valid type string and that it has a bit size.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
122
def type_sizes(type_):
    """Return the list of bit sizes that the NIR type string may take.

    A sized type yields a one-element list with its own size; an unsized
    type yields the fixed list of sizes associated with its base type.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    if type_ == 'bool':
        return [1, 16, 32]
    if type_ == 'float':
        return [16, 32, 64]
    # Everything else (the unsized integer types) falls through to here.
    return [1, 8, 16, 32, 64]
132
def type_base_type(type_):
    """Return the base type of a NIR type string with any bit size removed.

    Asserts that type_ starts with one of the known base types.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
137
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
#
# Both property strings end in a space so that they can be concatenated
# directly into a space-separated list.
_2src_commutative = "2src_commutative "
associative = "associative "

# Global dictionary of opcodes, keyed by opcode name.
opcodes = dict()
148
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Build an Opcode from the arguments and register it under its name.

    Asserts that no opcode with the same name was registered before.  The
    arguments are forwarded unchanged to the Opcode constructor.
    """
    assert name not in opcodes
    info = Opcode(name=name,
                  output_size=output_size,
                  output_type=output_type,
                  input_sizes=input_sizes,
                  input_types=input_types,
                  is_conversion=is_conversion,
                  algebraic_properties=algebraic_properties,
                  const_expr=const_expr)
    opcodes[name] = info
155
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output type differs from its input.

    Output and input sizes are both 0, is_conversion is False and there are
    no algebraic properties.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0], input_types=[in_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
158
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type ty.

    Output and input sizes are both 0, is_conversion is False and there are
    no algebraic properties.
    """
    opcode(name=name, output_size=0, output_type=ty,
           input_sizes=[0], input_types=[ty],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
161
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes.

    is_conversion is False and there are no algebraic properties.
    """
    opcode(name=name, output_size=output_size, output_type=output_type,
           input_sizes=[input_size], input_types=[input_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
166
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a reduction opcode.

    The opcodes are named name + "2", name + "3" and name + "4".  Their
    constant expressions are assembled from three format templates:

    - prereduce_expr is applied to each source component (field {src}) and
      the result is wrapped in parentheses,
    - reduce_expr combines two partial results (fields {src0} and {src1}),
    - final_expr is applied to the parenthesized reduction (field {src}).
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    # Pre-reduced, parenthesized operand for each of the four components.
    x, y, z, w = [
        "(" + prereduce_expr.format(src="src0." + component) + ")"
        for component in "xyzw"
    ]

    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               finish(combine(x, y)))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               finish(combine(combine(x, y), z)))
    # The 4-wide form reduces pairwise: (x, y) and (z, w), then both halves.
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               finish(combine(combine(x, y), combine(z, w))))
185
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion.

    Same as unop_convert except that is_conversion is True.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0], input_types=[in_type],
           is_conversion=True, algebraic_properties="",
           const_expr=const_expr)
188
# Basic unary operations, registered in table order.  Several of the
# floating-point expressions pick a double or a float C function (or
# literal) depending on bit_size.
for _name, _ty, _expr in [
    ("mov", tuint, "src0"),

    ("ineg", tint, "-src0"),
    ("fneg", tfloat, "-src0"),
    ("inot", tint, "~src0"),  # invert every bit of the integer
    ("fsign", tfloat,
     "bit_size == 64 ? "
     "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
     "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"),
    ("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)"),
    ("iabs", tint, "(src0 < 0) ? -src0 : src0"),
    ("fabs", tfloat, "fabs(src0)"),
    ("fsat", tfloat,
     "bit_size == 64 ? "
     "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : "
     "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"),
    ("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0"),
    ("frsq", tfloat,
     "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)"),
    ("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)"),
    ("fexp2", tfloat, "exp2f(src0)"),
    ("flog2", tfloat, "log2f(src0)"),
]:
    unop(_name, _ty, _expr)
208
# Generate all of the numeric conversion opcodes.
#
# Each opcode is named "<src initial>2<dst initial><dst bit size>", with a
# rounding-mode suffix appended for the float -> 16-bit float variants.
for src_t in [tint, tuint, tfloat, tbool]:
    # Destination base types that each source base type converts to.
    dst_types = {
        tbool: [tfloat, tint],
        tint: [tfloat, tint, tbool],
        tuint: [tfloat, tuint],
        tfloat: [tint, tuint, tfloat, tbool],
    }[src_t]

    for dst_t in dst_types:
        for dst_bit_size in type_sizes(dst_t):
            sized_dst_t = dst_t + str(dst_bit_size)
            base_name = "{0}2{1}{2}".format(src_t[0], dst_t[0], dst_bit_size)
            float_to_float = dst_t == tfloat and src_t == tfloat

            if float_to_float and dst_bit_size == 16:
                # One opcode per explicit rounding mode plus an unsuffixed
                # one that leaves the value untouched.
                rnd_modes = ['_rtne', '_rtz', '']
                for rnd_mode in rnd_modes:
                    if rnd_mode:
                        # rnd_mode selects the matching
                        # _mesa_float_to_float16_* helper.
                        conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16%s(src0));
} else {
dst = src0;
}
""" % rnd_mode
                    else:
                        conv_expr = "src0"

                    unop_numeric_convert(base_name + rnd_mode, sized_dst_t,
                                         src_t, conv_expr)
            elif float_to_float and dst_bit_size == 32:
                conv_expr = """
if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
dst = _mesa_double_to_float_rtz(src0);
} else {
dst = src0;
}
"""
                unop_numeric_convert(base_name, sized_dst_t, src_t, conv_expr)
            else:
                # Conversions to bool test against zero; everything else is
                # a plain assignment.
                if dst_t == tbool:
                    conv_expr = "src0 != 0"
                else:
                    conv_expr = "src0"
                unop_numeric_convert(base_name, sized_dst_t, src_t, conv_expr)
266
267
# Unary floating-point rounding operations.  Each picks the double or the
# float ("f"-suffixed) C function depending on bit_size.
for _name, _cfunc in [("ftrunc", "trunc"), ("fceil", "ceil"),
                      ("ffloor", "floor")]:
    unop(_name, tfloat,
         "bit_size == 64 ? {0}(src0) : {0}f(src0)".format(_cfunc))
unop("ffract", tfloat,
     "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat,
     "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Round-trip through half precision; magnitudes below 2^-14 become a signed
# zero.
unop("fquantize2f16", tfloat,
     "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : "
     "_mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.
for _name, _cfunc in [("fsin", "sin"), ("fcos", "cos")]:
    unop(_name, tfloat,
         "bit_size == 64 ? {0}(src0) : {0}f(src0)".format(_cfunc))

# dfrexp: exponent and significand halves of frexp().
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.  The derivative of a constant is 0.
for _precision in ("", "_fine", "_coarse"):
    for _axis in ("x", "y"):
        unop("fdd" + _axis + _precision, tfloat, "0.0")
298
299
# Floating point pack and unpack operations.

def pack_2x16(fmt):
    """Register pack_<fmt>_2x16.

    The opcode packs two 32-bit float components into one 32-bit word using
    the C helper pack_<fmt>_1x16: component x lands in the low 16 bits and
    component y in the high 16 bits.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    # The literal "fmt" inside the template is a placeholder.
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
307
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8.

    The opcode packs four 32-bit float components into one 32-bit word using
    the C helper pack_<fmt>_1x8; components x, y, z and w occupy successive
    bytes starting from the least significant one.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    # The literal "fmt" inside the template is a placeholder.
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
315
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16.

    The opcode splits one 32-bit word into two 32-bit float components using
    the C helper unpack_<fmt>_1x16: component x comes from the low 16 bits
    and component y from the high 16 bits, mirroring pack_2x16.
    """
    # The high half must be shifted *down*.  The previous "src0.x << 16"
    # always truncated to zero once cast to uint16_t.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
321
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8.

    The opcode splits one 32-bit word into four 32-bit float components
    using the C helper unpack_<fmt>_1x8; components x, y, z and w come from
    successive bytes starting from the least significant one.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    # The literal "fmt" inside the template is a placeholder.
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
329
330
# Instantiate the pack opcodes: both widths for the normalized formats, and
# only the 2x16 form for half floats.
for _fmt in ("snorm", "unorm"):
    pack_2x16(_fmt)
    pack_4x8(_fmt)
pack_2x16("half")

# Matching unpack opcodes, in the same order.
for _fmt in ("snorm", "unorm"):
    unpack_2x16(_fmt)
    unpack_4x8(_fmt)
unpack_2x16("half")
341
# Integer packing: combine several narrower unsigned components into a
# single wider word, with component x in the least significant position.

unop_horiz(name="pack_uvec2_to_uint", output_size=1, output_type=tuint32,
           input_size=2, input_type=tuint32, const_expr="""
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz(name="pack_uvec4_to_uint", output_size=1, output_type=tuint32,
           input_size=4, input_type=tuint32, const_expr="""
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

unop_horiz(name="pack_32_2x16", output_size=1, output_type=tuint32,
           input_size=2, input_type=tuint16,
           const_expr="dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz(name="pack_64_2x32", output_size=1, output_type=tuint64,
           input_size=2, input_type=tuint32,
           const_expr="dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz(name="pack_64_4x16", output_size=1, output_type=tuint64,
           input_size=4, input_type=tuint16,
           const_expr="dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
361
# Integer unpacking: split one wide unsigned word (a single-component
# source) into narrower components, with the least significant part in x.

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# The source has exactly one component, so every destination channel has to
# be taken from src0.x.  The w channel previously read the nonexistent
# src0.w.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")
370
# Same layout as unpack_half_2x16 but using the flush-to-zero C helper.
# Component y comes from the high 16 bits, so the word must be shifted
# *down*; the previous "src0.x << 16" always truncated to zero once cast to
# uint16_t.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
375
# Lowered floating point unpacking operations.
#
# The *_split_x opcodes read the low 16 bits of the source and the
# *_split_y opcodes read the high 16 bits.  The "_flush_to_zero" variants
# only differ in the C helper they call.
for _variant in ("", "_flush_to_zero"):
    _helper = "unpack_half_1x16" + _variant
    unop_convert("unpack_half_2x16_split_x" + _variant, tfloat32, tuint32,
                 _helper + "((uint16_t)(src0 & 0xffff))")
    unop_convert("unpack_half_2x16_split_y" + _variant, tfloat32, tuint32,
                 _helper + "((uint16_t)(src0 >> 16))")

# Low (x) and high (y) halves of a 32-bit or 64-bit unsigned value.
for _name, _out_t, _in_t, _expr in [
    ("unpack_32_2x16_split_x", tuint16, tuint32, "src0"),
    ("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16"),
    ("unpack_64_2x32_split_x", tuint32, tuint64, "src0"),
    ("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32"),
]:
    unop_convert(_name, _out_t, _in_t, _expr)
393
# Bit operations, part of ARB_gpu_shader5.

# Reverse the order of the 32 bits of the source.
unop(name="bitfield_reverse", ty=tuint32, const_expr="""
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")

# Number of set bits in the source.
unop_convert(name="bit_count", out_type=tuint32, in_type=tuint,
             const_expr="""
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")

# Index of the most significant set bit, or -1 if no bit is set.
unop_convert(name="ufind_msb", out_type=tint32, in_type=tuint,
             const_expr="""
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")

# Signed variant: looks for the highest 0 bit of a negative source and the
# highest 1 bit of a non-negative one; -1 if there is none.
unop(name="ifind_msb", ty=tint32, const_expr="""
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the least significant set bit, or -1 if no bit is set.
unop_convert(name="find_lsb", out_type=tint32, in_type=tint,
             const_expr="""
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
444
445
# fnoise<i>_<j>: i output components from j input components, for every
# combination of 1..4.  The constant expression is always 0.0f.
for i in (1, 2, 3, 4):
    for j in (1, 2, 3, 4):
        unop_horiz("fnoise%d_%d" % (i, j), i, tfloat, j, tfloat, "0.0f")
449
450
# AMD_gcn_shader extended instructions

# Face coordinates of a 3-component direction: the two minor-axis values of
# the dominant face, divided by twice the major-axis value and offset by 0.5.
unop_horiz(name="cube_face_coord", output_size=2, output_type=tfloat32,
           input_size=3, input_type=tfloat32, const_expr="""
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# Face index 0..5 of a 3-component direction: +X, -X, +Y, -Y, +Z, -Z.
unop_horiz(name="cube_face_index", output_size=1, output_type=tfloat32,
           input_size=3, input_type=tfloat32, const_expr="""
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
unop_reduce(name="fsum", output_size=1, output_type=tfloat,
            input_type=tfloat, prereduce_expr="{src}",
            reduce_expr="{src0} + {src1}", final_expr="{src}")
488
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    The result has type out_type.  Output and input sizes are all 0 and
    is_conversion is False.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0, 0], input_types=[in_type, in_type],
           is_conversion=False, algebraic_properties=alg_props,
           const_expr=const_expr)
492
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result share type ty."""
    binop_convert(name=name, out_type=ty, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
495
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source opcode on type ty that yields a bool1."""
    binop_convert(name=name, out_type=tbool1, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
498
def binop_compare16(name, ty, alg_props, const_expr):
    """Register a two-source opcode on type ty that yields a bool16."""
    binop_convert(name=name, out_type=tbool16, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
501
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source opcode on type ty that yields a bool32."""
    binop_convert(name=name, out_type=tbool32, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
504
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register the bool1, bool16 and bool32 flavours of a comparison.

    The bool1 opcode keeps the plain name; the bool16 and bool32 opcodes
    get "16" and "32" appended to it.
    """
    variants = (
        (binop_compare, name),
        (binop_compare16, name + "16"),
        (binop_compare32, name + "32"),
    )
    for register, variant_name in variants:
        register(variant_name, ty, alg_props, const_expr)
509
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types per source.

    is_conversion is False and there are no algebraic properties.
    """
    opcode(name=name, output_size=out_size, output_type=out_type,
           input_sizes=[src1_size, src2_size],
           input_types=[src1_type, src2_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
514
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a two-source reduction.

    The opcodes are named name + "2", name + "3" and name + "4"; both
    sources have the same width and type, and every variant is marked
    2src_commutative.  The constant expressions are assembled from three
    format templates:

    - prereduce_expr combines matching components of the two sources
      (fields {src0} and {src1}) and is wrapped in parentheses,
    - reduce_expr combines two partial results (fields {src0} and {src1}),
    - final_expr is applied to the parenthesized reduction (field {src}).
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    # Pre-reduced, parenthesized term for each of the four components.
    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + component,
                                    src1="src1." + component) + ")"
        for component in "xyzw"
    ]

    def register(width, expr):
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], False,
               _2src_commutative, expr)

    register(2, finish(combine(x, y)))
    register(3, finish(combine(combine(x, y), z)))
    # The 4-wide form reduces pairwise: (x, y) and (z, w), then both halves.
    register(4, finish(combine(combine(x, y), combine(z, w))))
536
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Register the bool1, bool16 and bool32 flavours of a reduction.

    The bool1 opcodes keep the given name.  For the other two sizes the
    first character of name is replaced by "b16" or "b32".
    """
    tail = name[1:]
    flavours = (
        (name, tbool1),
        ("b16" + tail, tbool16),
        ("b32" + tail, tbool32),
    )
    for flavour_name, bool_type in flavours:
        binop_reduce(flavour_name, output_size, bool_type, src_type,
                     prereduce_expr, reduce_expr, final_expr)
545
# Addition and subtraction.
#
# The floating-point forms honour the round-toward-zero execution mode: for
# 64-bit values through a dedicated helper, otherwise by computing in double
# and rounding the result back to float.

binop(name="fadd", ty=tfloat, alg_props=_2src_commutative + associative,
      const_expr="""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_add_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
dst = src0 + src1;
}
""")
binop(name="iadd", ty=tint, alg_props=_2src_commutative + associative,
      const_expr="src0 + src1")

# Saturating integer add/subtract: clamp instead of wrapping around.
binop(name="iadd_sat", ty=tint, alg_props=_2src_commutative, const_expr="""
src1 > 0 ?
(src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
(src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
""")
binop(name="uadd_sat", ty=tuint, alg_props=_2src_commutative,
      const_expr="(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
binop(name="isub_sat", ty=tint, alg_props="", const_expr="""
src1 < 0 ?
(src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
(src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
""")
binop(name="usub_sat", ty=tuint, alg_props="",
      const_expr="src0 < src1 ? 0 : src0 - src1")

binop(name="fsub", ty=tfloat, alg_props="", const_expr="""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_sub_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
dst = src0 - src1;
}
""")
binop(name="isub", ty=tint, alg_props="", const_expr="src0 - src1")
582
# Floating-point multiply, with the same round-toward-zero handling as
# fadd/fsub.
binop(name="fmul", ty=tfloat, alg_props=_2src_commutative + associative,
      const_expr="""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_mul_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop(name="imul", ty=tint, alg_props=_2src_commutative + associative,
      const_expr="src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert(name="imul_2x32_64", out_type=tint64, in_type=tint32,
              alg_props=_2src_commutative,
              const_expr="(int64_t)src0 * (int64_t)src1")
binop_convert(name="umul_2x32_64", out_type=tuint64, in_type=tuint32,
              alg_props=_2src_commutative,
              const_expr="(uint64_t)src0 * (uint64_t)src1")

# High half of a signed integer multiply (the upper bit_size bits of the
# double-width product).
binop(name="imul_high", ty=tint, alg_props=_2src_commutative, const_expr="""
if (bit_size == 64) {
/* We need to do a full 128-bit x 128-bit multiply in order for the sign
* extension to work properly. The casts are kind-of annoying but needed
* to prevent compiler warnings.
*/
uint32_t src0_u32[4] = {
src0,
(int64_t)src0 >> 32,
(int64_t)src0 >> 63,
(int64_t)src0 >> 63,
};
uint32_t src1_u32[4] = {
src1,
(int64_t)src1 >> 32,
(int64_t)src1 >> 63,
(int64_t)src1 >> 63,
};
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# High half of an unsigned integer multiply.
binop(name="umul_high", ty=tuint, alg_props=_2src_commutative, const_expr="""
if (bit_size == 64) {
/* The casts are kind-of annoying but needed to prevent compiler warnings. */
uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# Product of the low halves of the two unsigned sources.
binop(name="umul_low", ty=tuint32, alg_props=_2src_commutative,
      const_expr="""
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")
648
649
# Division.  The integer forms yield 0 when dividing by zero.
binop("fdiv", tfloat, "", "src0 / src1")
for _name, _ty in (("idiv", tint), ("udiv", tuint)):
    binop(_name, _ty, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
binop_convert(name="uadd_carry", out_type=tuint, in_type=tuint,
              alg_props=_2src_commutative, const_expr="src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
binop_convert(name="usub_borrow", out_type=tuint, in_type=tuint,
              alg_props="", const_expr="src0 < src1")
663
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +  (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +            (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
#    (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#                 =   (x & y) +      ((x ^ y)  >> 1)
#
# The signed and unsigned opcodes share the same expression text.
for _name, _ty in (("ihadd", tint), ("uhadd", tuint)):
    binop(_name, _ty, _2src_commutative,
          "(src0 & src1) + ((src0 ^ src1) >> 1)")
676
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +     (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +              (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
#    (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                     = (x | y) - ((x ^ y) >> 1)
#
# The xor term has to be *subtracted*, as derived above.  Adding it (as this
# code used to) gives wrong results, e.g. rhadd(3, 0) would be 4 instead
# of 2.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
689
# Unsigned modulus; yields 0 when the divisor is zero.
binop(name="umod", ty=tuint, alg_props="",
      const_expr="src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop(name="irem", ty=tint, alg_props="",
      const_expr="src1 == 0 ? 0 : src0 % src1")
binop(name="imod", ty=tint, alg_props="",
      const_expr=("src1 == 0 ? 0 : "
                  "((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
                  " src0 % src1 : src0 % src1 + src1)"))

# Floating-point counterparts: fmod rounds the quotient down (floorf), frem
# rounds it toward zero (truncf).
for _name, _round in (("fmod", "floorf"), ("frem", "truncf")):
    binop(_name, tfloat, "",
          "src0 - src1 * {0}(src0 / src1)".format(_round))
706
#
# Comparisons
#

# these integer-aware comparisons return a boolean (0 or ~0)
#
# Each entry is registered in its bool1, bool16 and bool32 flavours.
for _name, _ty, _props, _cmp in [
    ("flt", tfloat, "", "<"),
    ("fge", tfloat, "", ">="),
    ("feq", tfloat, _2src_commutative, "=="),
    ("fne", tfloat, _2src_commutative, "!="),
    ("ilt", tint, "", "<"),
    ("ige", tint, "", ">="),
    ("ieq", tint, _2src_commutative, "=="),
    ("ine", tint, _2src_commutative, "!="),
    ("ult", tuint, "", "<"),
    ("uge", tuint, "", ">="),
]:
    binop_compare_all_sizes(_name, _ty, _props,
                            "src0 {0} src1".format(_cmp))
724
# integer-aware GLSL-style comparisons that compare floats and ints
#
# "all equal" ANDs the per-component equality tests together and "any not
# equal" ORs the per-component inequality tests together.
for _prefix, _ty in (("f", tfloat), ("i", tint)):
    binop_reduce_all_sizes("ball_" + _prefix + "equal", 1, _ty,
                           "{src0} == {src1}", "{src0} && {src1}", "{src}")
    binop_reduce_all_sizes("bany_" + _prefix + "nequal", 1, _ty,
                           "{src0} != {src1}", "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce(name="fall_equal", output_size=1, output_type=tfloat32,
             src_type=tfloat32, prereduce_expr="{src0} == {src1}",
             reduce_expr="{src0} && {src1}",
             final_expr="{src} ? 1.0f : 0.0f")
binop_reduce(name="fany_nequal", output_size=1, output_type=tfloat32,
             src_type=tfloat32, prereduce_expr="{src0} != {src1}",
             reduce_expr="{src0} || {src1}",
             final_expr="{src} ? 1.0f : 0.0f")
742
# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with the unsized float type while its three
# siblings use float32 -- confirm whether that difference is intentional.
for _name, _ty, _props, _cmp in [
    ("slt", tfloat32, "", "<"),                  # Set on Less Than
    ("sge", tfloat, "", ">="),                   # Set on Greater or Equal
    ("seq", tfloat32, _2src_commutative, "=="),  # Set on Equal
    ("sne", tfloat32, _2src_commutative, "!="),  # Set on Not Equal
]:
    binop(_name, _ty, _props,
          "(src0 {0} src1) ? 1.0f : 0.0f".format(_cmp))
750
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a uint32 and is masked to the width of src0.
for _name, _ty, _shift in [
    ("ishl", tint, "<<"),
    ("ishr", tint, ">>"),
    ("ushr", tuint, ">>"),
]:
    opcode(_name, 0, _ty, [0, 0], [_ty, tuint32], False, "",
           "src0 {0} (src1 & (sizeof(src0) * 8 - 1))".format(_shift))

# Rotate left / right by src1 bits, with the count masked to the width of
# src0.
opcode(name="urol", output_size=0, output_type=tuint,
       input_sizes=[0, 0], input_types=[tuint, tuint32],
       is_conversion=False, algebraic_properties="", const_expr="""
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 << (src1 & rotate_mask)) |
(src0 >> (-src1 & rotate_mask));
""")
opcode(name="uror", output_size=0, output_type=tuint,
       input_sizes=[0, 0], input_types=[tuint, tuint32],
       is_conversion=False, algebraic_properties="", const_expr="""
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 >> (src1 & rotate_mask)) |
(src0 << (-src1 & rotate_mask));
""")
771
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.
for _name, _c_op in (("iand", "&"), ("ior", "|"), ("ixor", "^")):
    binop(_name, tuint, _2src_commutative + associative,
          "src0 {0} src1".format(_c_op))

# Dot products: multiply matching components, then add the products.
binop_reduce(name="fdot", output_size=1, output_type=tfloat,
             src_type=tfloat, prereduce_expr="{src0} * {src1}",
             reduce_expr="{src0} + {src1}", final_expr="{src}")

# Same expression as fdot, declared with a 4-component output.
binop_reduce(name="fdot_replicated", output_size=4, output_type=tfloat,
             src_type=tfloat, prereduce_expr="{src0} * {src1}",
             reduce_expr="{src0} + {src1}", final_expr="{src}")

# fdph: 3-component dot product of src0 with src1.xyz, plus src1.w.  The
# "_replicated" form is declared with a 4-component output.
for _name, _out_size in (("fdph", 1), ("fdph_replicated", 4)):
    opcode(_name, _out_size, tfloat, [3, 4], [tfloat, tfloat], False, "",
           "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
793
# Minimum and maximum.  The float forms call the C fmin/fmax functions and
# carry no algebraic properties; the integer forms pick a source with a
# comparison and are marked commutative and associative.
for _kind, _picked in (("min", "src0 : src1"), ("max", "src1 : src0")):
    binop("f" + _kind, tfloat, "", "f{0}(src0, src1)".format(_kind))
    for _prefix, _ty in (("i", tint), ("u", tuint)):
        binop(_prefix + _kind, _ty, _2src_commutative + associative,
              "src1 > src0 ? " + _picked)
800
# Operations on four 8-bit values packed into one 32-bit word.  Every
# expression walks the word one byte at a time (i = 0, 8, 16, 24).

# Saturated vector add for 4 8bit ints.
binop(name="usadd_4x8", ty=tint32,
      alg_props=_2src_commutative + associative, const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop(name="ussub_4x8", ty=tint32, alg_props="", const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop(name="umin_4x8", ty=tint32,
      alg_props=_2src_commutative + associative, const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop(name="umax_4x8", ty=tint32,
      alg_props=_2src_commutative + associative, const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop(name="umul_unorm_4x8", ty=tint32,
      alg_props=_2src_commutative + associative, const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
845
# fpow: src0 raised to the power src1.
# 64-bit operands must be folded with the double-precision pow(); using
# powf() there would round the operands and result to single precision.
# Narrower bit sizes use powf().
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
847
# "Split" pack operations: two scalar sources are combined into one wider
# value, with src0 in the low bits and src1 shifted into the high bits.

# Two 32-bit floats, each converted with pack_half_1x16(), packed into one
# 32-bit unsigned value.
binop_horiz("pack_half_2x16_split",
            1, tuint32,
            1, tfloat32,
            1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Two 32-bit unsigned values packed into one 64-bit unsigned value.
binop_convert("pack_64_2x32_split",
              tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# Two 16-bit unsigned values packed into one 32-bit unsigned value.
binop_convert("pack_32_2x16_split",
              tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
856
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, the bits and offset values
# are from the low five bits of src0 and src1, respectively.
#
# The result is a mask of `bits` consecutive one-bits whose lowest set bit is
# at position `offset`.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""")
865
# ldexp: src0 (float of any bit size) scaled by a power of two given by the
# 32-bit integer src1.  The double-precision ldexp() is used for 64-bit
# operands and ldexpf() otherwise.  Any result that is not a normal number is
# replaced by a zero carrying the sign of src0.
# NOTE(review): isnormal() is also false for infinities and NaN, so those
# results are replaced by a signed zero as well -- confirm this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
872
# vec2: build a 2-component vector out of the first component of each of the
# two inputs.
binop_horiz("vec2",
            2, tuint,
            1, tuint,
            1, tuint,
            """
dst.x = src0.x;
dst.y = src1.x;
""")
879
# Sub-word extraction.  src1 selects which byte (or 16-bit word) of src0 to
# take; the C cast picks unsigned or signed interpretation of that piece.
for _extract_name, _extract_type, _extract_expr in (
      # Byte extraction
      ("extract_u8", tuint, "(uint8_t)(src0 >> (src1 * 8))"),
      ("extract_i8", tint, "(int8_t)(src0 >> (src1 * 8))"),
      # Word extraction
      ("extract_u16", tuint, "(uint16_t)(src0 >> (src1 * 16))"),
      ("extract_i16", tint, "(int16_t)(src0 >> (src1 * 16))"),
):
   binop(_extract_name, _extract_type, "", _extract_expr)
887
888
def triop(name, ty, alg_props, const_expr):
   """Register a three-source opcode whose sources and result share one type.

   - name is the opcode name
   - ty is the type used for the output and for all three inputs
   - alg_props is the algebraic-properties string passed on to opcode()
   - const_expr is the constant-folding expression passed on to opcode()

   The output size and every input size are passed as 0, and the opcode is
   not marked as a conversion.
   """
   source_count = 3
   input_sizes = [0] * source_count
   input_types = [ty] * source_count
   opcode(name, 0, ty, input_sizes, input_types, False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Register a three-source tuint opcode with explicit component counts.

   - name is the opcode name
   - output_size is the size passed to opcode() for the result
   - src1_size, src2_size, src3_size are the sizes of the three inputs
   - const_expr is the constant-folding expression passed on to opcode()

   The output and all inputs are typed tuint, no algebraic properties are
   set, and the opcode is not marked as a conversion.
   """
   input_sizes = [src1_size, src2_size, src3_size]
   input_types = [tuint] * len(input_sizes)
   opcode(name, output_size, tuint, input_sizes, input_types, False, "",
          const_expr)
895
# ffma: fused multiply-add, src0 * src1 + src2.
#
# When nir_is_rounding_mode_rtz() reports round-toward-zero for this bit size,
# the _mesa_*_fma_rtz helpers are used: the double helper for 64-bit, the
# float helper for 32-bit, and for any other bit size the double helper
# followed by _mesa_double_to_float_rtz().  Otherwise the C library fmaf() is
# used for 32-bit and fma() for every other bit size.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

# flrp: linear interpolation between src0 and src1 with weight src2
# (src2 == 0 yields src0, src2 == 1 yields src1).
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
913
# Conditional Select
#
# A per-component vector select (the vector analogue of the C ?: operator).
# There are two versions, one for floating point bools (0.0 vs 1.0) and one
# for integer bools (0 vs ~0).  This is the floating-point one: src1 is
# chosen when src0 compares unequal to 0.0f, src2 otherwise.
triop("fcsel",
      tfloat32,
      "",
      "(src0 != 0.0f) ? src1 : src2")
922
# 3 way min/max/med
#
# The float variants are declared with the unsized tfloat type, so they can
# be constant-folded with 64-bit operands.  They therefore use the
# double-precision fmin()/fmax() rather than fminf()/fmaxf(), which would
# round 64-bit operands to single precision.  Narrower operands are
# unaffected, because the result is always one of the inputs.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
935
# Boolean-controlled select: src1 when src0 is true, src2 otherwise.  One
# opcode is registered per boolean type used for the condition (tbool1,
# tbool16, tbool32); the selected values and the result are tuint.
for _csel_name, _csel_bool_type in (
      ("bcsel", tbool1),
      ("b16csel", tbool16),
      ("b32csel", tbool32),
):
   opcode(_csel_name, 0, tuint, [0, 0, 0],
          [_csel_bool_type, tuint, tuint], False, "", "src0 ? src1 : src2")
942
# SM5 bfi assembly
#
# src0 is the mask, src1 the value to insert and src2 the base.  A zero mask
# returns the base unchanged.  Otherwise the insert value is shifted left by
# the number of trailing zero bits in the mask, and the result takes the
# masked bits from the shifted insert value and all other bits from the base.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")


# bitfield_select: per-bit select.  Where a bit of src0 is set the result bit
# comes from src1, otherwise from src2.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
960
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
#
# src0 is the base value, src1 the offset and src2 the field width in bits.
# A width of zero yields 0.  When the field ends below bit 32, the base is
# shifted left so the field's top bit lands in bit 31 and then shifted right
# by (32 - bits) to bring the field down to bit 0.  Otherwise the result is
# simply base >> offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
# Signed variant: `base` is declared as int here, whereas ubfe declares it
# unsigned; the rest of the expression is the same.
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
988
# GLSL bitfieldExtract()
#
# Unsigned variant: src0 is the base, src1 the offset and src2 the number of
# bits.  A zero width yields 0, and so does any negative offset/width or a
# field that would extend past bit 32.  Otherwise the base is shifted right by
# the offset and masked to `bits` bits; the mask is built from a 64-bit
# constant so that bits == 32 is handled.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed bitfield extract: src0 is the base, src1 the offset and src2 the
# number of bits.  A zero width yields 0, and so does any negative
# offset/width or a field that would extend past bit 32.
#
# Otherwise the field is moved so that its top bit sits in bit 31, then
# shifted right by (32 - bits) so that its lowest bit lands in bit 0 and the
# field's top bit is replicated into the upper bits.  Shifting right by
# `offset` instead would only be correct when offset == 32 - bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1014
# vec3: build a 3-component vector out of the first component of each of the
# three inputs.
triop_horiz("vec3",
            3,
            1, 1, 1,
            """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1022
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-source tuint opcode with explicit component counts.

   - name is the opcode name
   - output_size is the size passed to opcode() for the result
   - src1_size .. src4_size are the sizes of the four inputs
   - const_expr is the constant-folding expression passed on to opcode()

   The output and all inputs are typed tuint, no algebraic properties are
   set, and the opcode is not marked as a conversion.
   """
   input_sizes = [src1_size, src2_size, src3_size, src4_size]
   input_types = [tuint] * len(input_sizes)
   opcode(name, output_size, tuint, input_sizes, input_types, False, "",
          const_expr)
1029
# bitfield_insert: src0 is the base, src1 the value to insert, src2 the
# offset and src3 the number of bits.  A zero width returns the base
# unchanged; a negative offset/width or a field extending past bit 32 yields
# 0.  Otherwise `bits` bits of the insert value replace the bits of the base
# starting at `offset`.  The mask is built from a 64-bit constant so that
# bits == 32 is handled.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1043
# vec4: build a 4-component vector out of the first component of each of the
# four inputs.
quadop_horiz("vec4",
             4,
             1, 1, 1, 1,
             """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
1050
# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")
1062
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
#
# The constant expression multiplies the high 16 bits of src0 by the low 16
# bits of src1, shifts the product left by 16 and adds src2.
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")
1070
# 24-bit multiplies.  In both opcodes each multiplicand is shifted left by 8
# and back right by 8 as an int32_t, which sign-extends its low 24 bits
# before the multiply.

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * "
      "(((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension)
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * "
      "(((int32_t)src1 << 8) >> 8)")