nir: Add a new memory_barrier_tcs_patch intrinsic
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Record describing one NIR ALU opcode.

    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Validate the description of an opcode and store it on the instance.

        Parameters:

        - name: opcode name; the enum name is this with nir_op_ prepended.
        - output_size: int in 0..4, or 8, or 16.
        - output_type: type string; nir_type_ gets prepended to it.
        - input_sizes: non-empty list of ints in 0..4, one per source.  When
          output_size is non-zero, no input size may be zero.
        - input_types: list of type strings, same length as input_sizes.
        - is_conversion: True if the opcode represents a type conversion.
        - algebraic_properties: space-separated string; nir_op_is_ is
          prepended before each entry.
        - const_expr: an expression or series of statements that computes
          the constant value of the opcode from the constant values of its
          inputs.

        Constant expressions are written in terms of src0, src1, ...,
        src(N-1), where N is the number of sources, and store their result
        in dst.  Per-component inputs and outputs are scalars; the others
        are structs with fields x, y, z and w of the correct type.  All
        variables are already of the correct type, and the conversion
        between the C bool type and NIR_TRUE / NIR_FALSE is automatic.

        A per-component expression is executed once per component.  A
        non-per-component expression is expected to fill in dst.x, dst.y,
        etc. itself.  If "dst" does not appear anywhere in the expression,
        it is treated as "dst = <expression>" (per-component) or
        "dst.x = dst.y = ... = <expression>" (non-per-component).
        """
        # Type checks, in the same order the arguments are declared.
        for scalar, kind in ((name, str), (output_size, int),
                             (output_type, str)):
            assert isinstance(scalar, kind)
        for seq, elem_kind in ((input_sizes, int), (input_types, str)):
            assert isinstance(seq, list)
            # Only the first element is inspected.
            assert isinstance(seq[0], elem_kind)
        for scalar, kind in ((is_conversion, bool),
                             (algebraic_properties, str),
                             (const_expr, str)):
            assert isinstance(scalar, kind)

        # Value checks.
        assert len(input_sizes) == len(input_types)
        assert output_size in (0, 1, 2, 3, 4, 8, 16)
        nonzero_inputs_required = output_size != 0
        for size in input_sizes:
            assert 0 <= size <= 4
            assert not (nonzero_inputs_required and size == 0)

        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
92
# helper variables for strings
#
# Each name is bound to the NIR type string it spells.  Names without a
# numeric suffix are unsized types; the suffixed ones carry an explicit bit
# size, which type_has_size() / type_size() below can parse back out.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
109
# Splits a NIR type string into a base type ("type") and an optional run of
# trailing digits ("bits").
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def _match_type(type_):
    """Match type_ against _TYPE_SPLIT_RE, asserting that it is a NIR type."""
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return parsed


def type_has_size(type_):
    """Return True if the type string carries an explicit bit size."""
    bits = _match_type(type_).group('bits')
    return bits is not None


def type_size(type_):
    """Return the bit size of a sized type string as an int.

    Asserts if the string is not a NIR type or has no bit size.
    """
    bits = _match_type(type_).group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)


def type_sizes(type_):
    """Return the list of bit sizes a type string can take.

    A sized type yields a one-element list holding its own size; unsized
    types yield the list of sizes associated with their base type.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    # Unsized bool and float have their own size lists; every other unsized
    # type gets the full integer list.
    unsized = {
        'bool': [1, 8, 16, 32],
        'float': [16, 32, 64],
    }
    return unsized.get(type_, [1, 8, 16, 32, 64])


def type_base_type(type_):
    """Return the base type (int, uint, float or bool) of a type string."""
    return _match_type(type_).group('type')
138
139 # Operation where the first two sources are commutative.
140 #
# For 2-source operations, this is just mathematical commutativity.  Some
142 # 3-source operations, like ffma, are only commutative in the first two
143 # sources.
# Each property string ends in a space so that several properties can be
# concatenated with "+" into one space-separated algebraic_properties string.
_2src_commutative = "2src_commutative "
associative = "associative "

# global dictionary of opcodes
# Maps opcode name -> Opcode instance; filled in by opcode() below.
opcodes = {}
149
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Create an Opcode from the arguments and register it under its name.

    Asserts if an opcode with the same name has already been registered.
    The parameters are those of Opcode.__init__.
    """
    assert name not in opcodes
    entry = Opcode(name, output_size, output_type, input_sizes, input_types,
                   is_conversion, algebraic_properties, const_expr)
    opcodes[name] = entry
156
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output and input types may differ.

    Output size and input size are both 0, is_conversion is False and there
    are no algebraic properties.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
159
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type ty.

    Output size and input size are both 0, is_conversion is False and there
    are no algebraic properties.
    """
    opcode(name, output_size=0, output_type=ty, input_sizes=[0],
           input_types=[ty], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
162
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes.

    is_conversion is False and there are no algebraic properties.
    """
    opcode(name, output_size=output_size, output_type=output_type,
           input_sizes=[input_size], input_types=[input_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
167
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a reduction opcode.

    The variants are named name + "2", name + "3" and name + "4".  Their
    constant expressions are built from three format templates:

    - prereduce_expr, applied to each source component via {src};
    - reduce_expr, combining two partial results via {src0} and {src1};
    - final_expr, applied to the parenthesised reduction via {src}.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    # Pre-reduced, parenthesised expression for each source component.
    x, y, z, w = [
        "(" + prereduce_expr.format(src="src0." + comp) + ")"
        for comp in "xyzw"
    ]

    # The 3-wide form folds left to right; the 4-wide form pairs (x, y) with
    # (z, w) before the final combine.
    variants = (
        (2, combine(x, y)),
        (3, combine(combine(x, y), z)),
        (4, combine(combine(x, y), combine(z, w))),
    )
    for width, expr in variants:
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, finish(expr))
186
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode flagged as a conversion.

    Same as unop_convert() except that is_conversion is True.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
           input_types=[in_type], is_conversion=True,
           algebraic_properties="", const_expr=const_expr)
189
# Basic unary opcodes.  Each call registers one opcode whose constant
# expression is the C snippet given as the last argument.

# mov is declared on the unsized uint type and simply forwards its source.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fsign / fsat select between double and float literals based on bit_size.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): unlike frcp/frsq/fsqrt above, fexp2 and flog2 have no
# bit_size == 64 branch and always call the float variants -- confirm that
# this is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")

209
# Generate all of the numeric conversion opcodes
#
# For every (source base type, destination base type, destination bit size)
# combination allowed below, one opcode is registered through
# unop_numeric_convert().  Its name is
#   <first letter of src type> "2" <first letter of dst type> <dst bit size>
# optionally followed by a rounding-mode suffix, e.g. "f2f16_rtne".
for src_t in [tint, tuint, tfloat, tbool]:
    # Destination base types reachable from each source base type.
    if src_t == tbool:
        dst_types = [tfloat, tint]
    elif src_t == tint:
        dst_types = [tfloat, tint, tbool]
    elif src_t == tuint:
        dst_types = [tfloat, tuint]
    elif src_t == tfloat:
        dst_types = [tint, tuint, tfloat, tbool]

    for dst_t in dst_types:
        for dst_bit_size in type_sizes(dst_t):
            if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
                # float -> 16-bit float gets three opcodes: explicit
                # round-to-nearest-even, explicit round-toward-zero, and an
                # unsuffixed one whose expression is a plain assignment.
                rnd_modes = ['_rtne', '_rtz', '']
                for rnd_mode in rnd_modes:
                    if rnd_mode == '_rtne':
                        conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
} else {
dst = src0;
}
"""
                    elif rnd_mode == '_rtz':
                        conv_expr = """
if (bit_size > 16) {
dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
} else {
dst = src0;
}
"""
                    else:
                        conv_expr = "src0"

                    unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                                dst_t[0],
                                                                dst_bit_size,
                                                                rnd_mode),
                                         dst_t + str(dst_bit_size),
                                         src_t, conv_expr)
            elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
                # float -> 32-bit float: a wider source is narrowed with the
                # explicit _rtz helper when the execution mode asks for
                # round-toward-zero at 32 bits.
                conv_expr = """
if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
dst = _mesa_double_to_float_rtz(src0);
} else {
dst = src0;
}
"""
                unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                         dst_bit_size),
                                     dst_t + str(dst_bit_size), src_t, conv_expr)
            else:
                # Everything else is a plain assignment, except conversions
                # to bool, which compare against zero.
                conv_expr = "src0 != 0" if dst_t == tbool else "src0"
                unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                         dst_bit_size),
                                     dst_t + str(dst_bit_size), src_t, conv_expr)
267
268
# Unary floating-point rounding operations.
#
# Each expression picks the double or float C library function based on
# bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values with magnitude below 2^-14 become a signed zero; everything else
# is round-tripped through the half-float conversion helpers.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp stores the exponent that C frexp() writes through its pointer
# argument; frexp_sig stores frexp()'s return value and discards the
# exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.
#
# All derivative opcodes constant-fold to 0.0.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
299
300
301 # Floating point pack and unpack operations.
302
def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components -> one uint32.

    The constant expression calls pack_<fmt>_1x16 on each component and puts
    src0.x in the low 16 bits and src0.y in the high 16 bits.  Every
    occurrence of "fmt" in the template is replaced by the fmt argument.
    """
    opcode_name = "pack_" + fmt + "_2x16"
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    unop_horiz(opcode_name, 1, tuint32, 2, tfloat32,
               template.replace("fmt", fmt))
308
def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components -> one uint32.

    The constant expression calls pack_<fmt>_1x8 on each component and
    places x, y, z, w at bit offsets 0, 8, 16 and 24.  Every occurrence of
    "fmt" in the template is replaced by the fmt argument.
    """
    opcode_name = "pack_" + fmt + "_4x8"
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    unop_horiz(opcode_name, 1, tuint32, 4, tfloat32,
               template.replace("fmt", fmt))
316
def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 -> two float32 components.

    The constant expression calls unpack_<fmt>_1x16 on the low 16 bits of
    src0.x for dst.x and on the high 16 bits for dst.y.  Every occurrence of
    "fmt" in the template is replaced by the fmt argument.
    """
    # The high half must be shifted *down* before the uint16_t cast.  A left
    # shift by 16 leaves the low 16 bits zero, so the cast would always
    # yield 0 and dst.y would ignore the input entirely.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
322
def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 -> four float32 components.

    The constant expression calls unpack_<fmt>_1x8 on the bytes of src0.x at
    bit offsets 0, 8, 16 and 24 for x, y, z and w.  Every occurrence of
    "fmt" in the template is replaced by the fmt argument.
    """
    opcode_name = "unpack_" + fmt + "_4x8"
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    unop_horiz(opcode_name, 4, tfloat32, 1, tuint32,
               template.replace("fmt", fmt))
330
331
# Instantiate the pack/unpack helpers for each format.  Note that "half"
# only has 2x16 variants.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Low 16 bits of x combined with y shifted into the high 16 bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# x, y, z, w are OR-ed together at bit offsets 0, 8, 16 and 24; the
# components are not masked first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# Integer packs: the sources are widened by a cast before shifting so that
# the upper pieces are not lost.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
362
# Integer unpacks: every destination component is a right-shifted view of the
# single source component src0.x.

# One 64-bit value -> two 32-bit pieces (low piece in x).
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# One 64-bit value -> four 16-bit pieces.  The source has only one
# component, so the top piece must also come from src0.x; src0.w is not a
# meaningful input here.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

# One 32-bit value -> two 16-bit pieces (low piece in x).
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Half-float unpack using the flush-to-zero helper.  The high half must be
# shifted down before the uint16_t cast: a left shift by 16 leaves the low
# 16 bits zero, so the cast would always produce 0.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
376
# Lowered floating point unpacking operations.
#
# The *_split_x opcodes operate on the low half of the source and the
# *_split_y opcodes on the high half (source shifted right by half its
# width).

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

# Same as above, but through the flush-to-zero unpack helper.
unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: the destination type is narrower than the source type.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
394
# Bit operations, part of ARB_gpu_shader5.


# Reverses the order of the 32 bits of the source.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")

# Index of the highest set bit, scanning down from bit_size - 1; -1 if no
# bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")

# Signed variant, fixed at 32 bits: see the comment inside the expression.
# Yields -1 when no matching bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the lowest set bit, scanning up from bit 0; -1 if no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
445
446
# Registers the sixteen opcodes fnoise<i>_<j> for i, j in 1..4, where i is
# the output size and j the input size.  All of them constant-fold to 0.0f.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")


# AMD_gcn_shader extended instructions
#
# cube_face_coord: "ma" is twice the component with the largest magnitude
# (later matches overwrite earlier ones on ties).  dst.x / dst.y are chosen
# from the remaining components according to the major axis and its sign,
# then divided by ma and offset by 0.5.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# cube_face_index: 0/1 for +x/-x, 2/3 for +y/-y, 4/5 for +z/-z, selected by
# the component with the largest magnitude (later matches win on ties).
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
# Registers fsum2, fsum3 and fsum4 through unop_reduce().
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
489
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type.

    Output size and both input sizes are 0, and is_conversion is False.
    alg_props is passed through as the algebraic-properties string.
    """
    opcode(name, output_size=0, output_type=out_type, input_sizes=[0, 0],
           input_types=[in_type, in_type], is_conversion=False,
           algebraic_properties=alg_props, const_expr=const_expr)
493
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result all have type ty."""
    binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
496
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source opcode on type ty with a bool1 result."""
    binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
499
def binop_compare8(name, ty, alg_props, const_expr):
    """Register a two-source opcode on type ty with a bool8 result."""
    binop_convert(name, out_type=tbool8, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
502
def binop_compare16(name, ty, alg_props, const_expr):
    """Register a two-source opcode on type ty with a bool16 result."""
    binop_convert(name, out_type=tbool16, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
505
def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source opcode on type ty with a bool32 result."""
    binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                  const_expr=const_expr)
508
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register a comparison once per boolean result size.

    The bool1 variant keeps the plain name; the bool8, bool16 and bool32
    variants get "8", "16" and "32" appended to it.
    """
    binop_compare(name, ty, alg_props, const_expr)
    sized_variants = (
        ("8", binop_compare8),
        ("16", binop_compare16),
        ("32", binop_compare32),
    )
    for suffix, register in sized_variants:
        register(name + suffix, ty, alg_props, const_expr)
514
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types per source.

    is_conversion is False and there are no algebraic properties.
    """
    opcode(name, output_size=out_size, output_type=out_type,
           input_sizes=[src1_size, src2_size],
           input_types=[src1_type, src2_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
519
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a two-source reduction.

    The variants are named name + "2", name + "3" and name + "4"; both
    sources have the variant's width and type src_type, and every variant is
    marked 2src_commutative.  The constant expressions are built from three
    format templates:

    - prereduce_expr, applied per component to the matching components of
      the two sources via {src0} and {src1};
    - reduce_expr, combining two partial results via {src0} and {src1};
    - final_expr, applied to the parenthesised reduction via {src}.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    # Pre-reduced, parenthesised expression for each component pair.
    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + comp,
                                    src1="src1." + comp) + ")"
        for comp in "xyzw"
    ]

    # The 3-wide form folds left to right; the 4-wide form pairs (x, y) with
    # (z, w) before the final combine.
    variants = (
        (2, combine(x, y)),
        (3, combine(combine(x, y), z)),
        (4, combine(combine(x, y), combine(z, w))),
    )
    for width, expr in variants:
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], False,
               _2src_commutative, finish(expr))
541
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Register a boolean reduction once per boolean result size.

    The bool1 variant uses name unchanged.  For bool8, bool16 and bool32 the
    first character of name is replaced by "b8", "b16" and "b32".
    """
    tail = name[1:]
    per_size = (
        (name, tbool1),
        ("b8" + tail, tbool8),
        ("b16" + tail, tbool16),
        ("b32" + tail, tbool32),
    )
    for opcode_name, bool_type in per_size:
        binop_reduce(opcode_name, output_size, bool_type, src_type,
                     prereduce_expr, reduce_expr, final_expr)
552
# Float add.  When the execution mode requests round-toward-zero for this
# bit size the sum goes through the _rtz helpers (for non-64-bit sizes it is
# computed in double and narrowed); otherwise plain C addition is used.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_add_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Signed saturating add: when the wrapped sum moved in the wrong direction,
# the result is (1 << (bit_size - 1)) - 1 for positive src1 and
# 1 << (bit_size - 1) otherwise.
binop("iadd_sat", tint, _2src_commutative, """
src1 > 0 ?
(src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
(src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
""")
# Unsigned saturating add: a wrapped sum is smaller than src0.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Signed saturating subtract; mirrors iadd_sat with the sign test on src1
# reversed.
binop("isub_sat", tint, "", """
src1 < 0 ?
(src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
(src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Float subtract; same rounding-mode handling as fadd.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_sub_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")

# Float multiply; same rounding-mode handling as fadd.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_mul_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
# Both sources are widened to 64 bits before multiplying.
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")
608
# high 32-bits of signed integer multiply
#
# For sizes below 64 bits the product is formed in 64-bit arithmetic and
# shifted right by bit_size.  For 64 bits the sources are split into 32-bit
# limbs and multiplied with ubm_mul_u32arr(); the result is assembled from
# limbs 2 and 3 of the product.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
/* We need to do a full 128-bit x 128-bit multiply in order for the sign
* extension to work properly. The casts are kind-of annoying but needed
* to prevent compiler warnings.
*/
uint32_t src0_u32[4] = {
src0,
(int64_t)src0 >> 32,
(int64_t)src0 >> 63,
(int64_t)src0 >> 63,
};
uint32_t src1_u32[4] = {
src1,
(int64_t)src1 >> 32,
(int64_t)src1 >> 63,
(int64_t)src1 >> 63,
};
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
#
# Same structure as imul_high, with two-limb sources and unsigned casts.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
/* The casts are kind-of annoying but needed to prevent compiler warnings. */
uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# unsigned multiply of the low halves of the two sources: each source is
# masked down to its low bit_size / 2 bits before multiplying
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")
655
656
# Division.  The integer variants constant-fold a division by zero to 0.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
# (The opcode itself is declared with a uint result type.)

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
# (The opcode itself is declared with a uint result type.)

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       = (x & y) + (x & ~y) + (x & y) + (~x & y)
#       = 2 * (x & y) + (x & ~y) + (~x & y)
#       = ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              = (x & y) + ((x ^ y) >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
683
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
#           = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
#           = ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
#
# Note the subtraction: adding the shifted xor instead (as in hadd) gives the
# wrong answer, e.g. rhadd(3, 0) must be 2, and 3 - (3 >> 1) = 2 whereas
# 3 + (3 >> 1) = 4.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
696
# Unsigned modulus; a zero divisor constant-folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder". We follow the conventions used by LLVM and
# SPIR-V. The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation. For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 back onto the C remainder when the remainder is non-zero
# and the operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# The float variants differ only in how the quotient is rounded: floorf for
# fmod, truncf for frem.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
713
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
#
# Each call registers four opcodes, one per boolean result size: the plain
# name plus variants with "8", "16" and "32" appended.

binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# Vector reductions: "all equal" ANDs the per-component results together,
# "any not equal" ORs them.  Each call registers widths 2, 3 and 4 for every
# boolean result size.

binop_reduce_all_sizes("ball_fequal", 1, tfloat, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal", 1, tint, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared on the unsized float type while slt, seq and
# sne use float32 -- confirm whether this difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
757
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a uint32 and is masked with
# (bit width of src0) - 1 before use.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Rotates: one shift by the masked count OR-ed with the opposite shift by
# the masked negated count.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 << (src1 & rotate_mask)) |
(src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 >> (src1 & rotate_mask)) |
(src0 << (-src1 & rotate_mask));
""")
778
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")


# Dot products: per-component multiply, then sum.  Registers fdot2, fdot3
# and fdot4.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Same expression as fdot, but declared with an output size of 4.
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Three-component dot product of src0 with src1.xyz, plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Min / max.  The float variants use the C library and carry no algebraic
# properties here; the integer variants are marked commutative and
# associative.
binop("fmin", tfloat, "", "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
807
# Saturating add of four unsigned 8-bit lanes packed into one 32-bit value:
# every lane sum is clamped to 0xff before being written back to its lane.
binop("usadd_4x8", tint32,
      _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturating subtract of four unsigned 8-bit lanes packed into one 32-bit
# value: a lane stays 0 unless its src0 byte is larger than its src1 byte.
binop("ussub_4x8", tint32,
      "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")
826
# Per-lane minimum of four unsigned 8-bit lanes packed into a 32-bit value.
binop("umin_4x8", tint32,
      _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# Per-lane maximum of four unsigned 8-bit lanes packed into a 32-bit value.
binop("umax_4x8", tint32,
      _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# Per-lane unorm multiply of four packed 8-bit lanes: (a * b) / 255, using
# integer division.
binop("umul_unorm_4x8", tint32,
      _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
852
# Floating-point power, src0 raised to src1.
#
# 64-bit operands must go through the double-precision pow(); powf() would
# first round them to single precision.  Every other bit size uses powf().
# This is the same bit_size == 64 selection that ldexp uses for
# ldexp()/ldexpf().
binop("fpow", tfloat, "",
      "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
854
# Pack two 32-bit floats as half floats into one 32-bit word: src0 lands in
# the low 16 bits and src1 in the high 16 bits.
binop_horiz("pack_half_2x16_split",
            1, tuint32,
            1, tfloat32,
            1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Combine two 32-bit values into a 64-bit one (src0 low, src1 high).
binop_convert("pack_64_2x32_split",
              tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# Combine two 16-bit values into a 32-bit one (src0 low, src1 high).
binop_convert("pack_32_2x16_split",
              tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
863
# bfm implements the behavior of the first operation of the SM5 "bfi"
# assembly and that of the "bfi1" i965 instruction.  The bits and offset
# values are taken from the low five bits of src0 and src1, respectively,
# and the result is a mask of `bits` ones shifted left by `offset`.
binop_convert("bfm",
              tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""")
872
# ldexp: scale the float src0 by two to the power of the 32-bit integer src1,
# using the double or single precision C library routine depending on
# bit_size.  Any result that is not a normal number is replaced by a zero
# carrying the sign of src0.
# NOTE(review): isnormal() is also false for infinities and NaN, so those
# results are turned into signed zero as well -- confirm this is intended.
opcode("ldexp", 0, tfloat,
       [0] * 2, [tfloat, tint32],
       False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
879
# Build a 2-component vector from the first component of each of the two
# sources.
binop_horiz("vec2",
            2, tuint,
            1, tuint,
            1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")
886
# Byte extraction: shift src0 right by src1 bytes and keep the low 8 bits,
# zero-extended (u8) or sign-extended (i8) by the C cast.
binop("extract_u8", tuint, "",
      "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "",
      "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: same as above with 16-bit units.
binop("extract_u16", tuint, "",
      "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "",
      "(int16_t)(src0 >> (src1 * 16))")
894
895
def triop(name, ty, alg_props, const_expr):
   """Declare a three-source opcode whose sources and result share one type.

   Forwards to opcode() with an output size of 0, three input sizes of 0,
   `ty` as the output type and as the type of every input, and
   is_conversion set to False.
   """
   input_sizes = [0] * 3
   input_types = [ty] * 3
   opcode(name, 0, ty, input_sizes, input_types, False, alg_props,
          const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Declare a three-source tuint opcode with explicit vector sizes.

   Forwards to opcode() with the given output size and per-source sizes,
   tuint as the output type and as the type of every input, no algebraic
   properties, and is_conversion set to False.
   """
   input_sizes = [src1_size, src2_size, src3_size]
   opcode(name, output_size, tuint,
          input_sizes, [tuint] * len(input_sizes),
          False, "", const_expr)
902
# Fused multiply-add of src0, src1 and src2.  When the execution mode asks
# for round-towards-zero at this bit size the _mesa_*_fma_rtz helpers are
# used (with the remaining bit sizes going through the double helper and a
# round-towards-zero narrowing); otherwise the C library fmaf()/fma() is
# used.
triop("ffma", tfloat,
      _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

# Linear interpolation between src0 and src1 with src2 as the weight.
triop("flrp", tfloat, "",
      "src0 * (1 - src2) + src1 * src2")
920
# Conditional select
#
# A vector conditional select instruction (like ?:, but operating
# per-component on vectors).  There are two versions: one for floating-point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).  This is the
# floating-point one; it picks src1 when src0 is not 0.0 and src2 otherwise.
triop("fcsel", tfloat32, "",
      "(src0 != 0.0f) ? src1 : src2")
929
# 3-way min/max/med
#
# The float variants are declared with the unsized tfloat type, and other
# tfloat opcodes in this file are evaluated at bit_size == 64.  They
# therefore use the double-precision fmin()/fmax(), like the two-source
# fmin/fmax opcodes, instead of fminf()/fmaxf(), which would round 64-bit
# operands to single precision.  Narrower operands are unaffected, because
# picking one of the inputs is exact at any precision.
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

# Median of three: max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
942
# Boolean conditional select: src1 when src0 is true, src2 otherwise.  One
# opcode is declared per boolean type, in the order listed here; they differ
# only in the type of the condition source.
for csel_name, csel_bool_type in (("bcsel", tbool1),
                                  ("b8csel", tbool8),
                                  ("b16csel", tbool16),
                                  ("b32csel", tbool32)):
   opcode(csel_name, 0, tuint, [0, 0, 0],
          [csel_bool_type, tuint, tuint], False, "", "src0 ? src1 : src2")
951
# SM5 bfi assembly.
#
# src0 is the mask, src1 the value to insert and src2 the base.  The insert
# value is shifted left until it lines up with the lowest set bit of the
# mask, then merged into the base under the mask.  A zero mask returns the
# base unchanged (and avoids looping forever looking for a set bit).
triop("bfi", tuint32,
      "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")


# Per-bit select: take bits from src1 where src0 is set, from src2 elsewhere.
triop("bitfield_select", tuint, "",
      "(src0 & src1) | (~src0 & src2)")
969
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and
# bits are used.
#
# Both opcodes move the requested field to the top of the word with a left
# shift and bring it back down with a right shift of (32 - bits); the
# unsigned variant zero-fills and the signed variant shifts a signed int.
# When the field reaches or passes bit 31 the base is simply shifted right
# by offset.
opcode("ubfe", 0, tuint32,
       [0] * 3, [tuint32] * 3,
       False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0] * 3, [tint32, tuint32, tuint32],
       False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
997
# GLSL bitfieldExtract(), unsigned variant.
#
# Returns `bits` bits of src0 starting at bit `offset`, zero-extended.  A
# zero width yields 0, and so do the out-of-range offset/width combinations
# that the spec leaves undefined.
opcode("ubitfield_extract", 0, tuint32,
       [0] * 3, [tuint32, tint32, tint32],
       False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# GLSL bitfieldExtract(), signed variant.
#
# Returns `bits` bits of src0 starting at bit `offset`, sign-extended from
# the top bit of the field.  A zero width yields 0, as do negative or
# out-of-range offset/width combinations.
#
# The field is first moved to the top of the word with a left shift of
# (32 - offset - bits).  The arithmetic right shift that brings it back to
# bit 0 must therefore be (32 - bits), exactly as in the SM5 ibfe opcode.
# Shifting right by `offset` instead leaves the field in the wrong place:
# base = 0xF0, offset = 4, bits = 4 would give 0xFF000000 rather than -1.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1023
# Build a 3-component vector from the first component of each of the three
# sources.
triop_horiz("vec3",
            3,
            1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1031
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit vector sizes.

   Forwards to opcode() with the given output size and per-source sizes,
   tuint as the output type and as the type of every input, no algebraic
   properties, and is_conversion set to False.
   """
   input_sizes = [src1_size, src2_size, src3_size, src4_size]
   opcode(name, output_size, tuint,
          input_sizes, [tuint] * len(input_sizes),
          False, "", const_expr)
1038
# Bitfield insert: replace `bits` bits of src0 (the base), starting at bit
# `offset`, with the low bits of src1 (the insert value).  A zero width
# returns the base unchanged; negative or out-of-range offset/width
# combinations yield 0.  The mask is built from a 64-bit one so that a
# width of 32 does not overflow the shift.
opcode("bitfield_insert", 0, tuint32,
       [0] * 4, [tuint32, tuint32, tint32, tint32],
       False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1052
# Build a 4-component vector from the first component of each of the four
# sources.
quadop_horiz("vec4",
             4,
             1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
1059
# Build an 8-component vector from the first component of each of the eight
# sources.  Components past w are named e, f, g and h.
opcode("vec8", 8, tuint,
       [1] * 8,
       [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

# Build a 16-component vector from the first component of each of the
# sixteen sources.  Components past w are named e through p.
opcode("vec16", 16, tuint,
       [1] * 16,
       [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")
1093
# An integer multiply instruction for address calculation.  This is similar
# to imul, except that the results are undefined in case of overflow.
# Overflow is defined according to the size of the variable being
# dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization pass to
# propagate bounds (i.e. from a load/store intrinsic) to the sources, such
# that lower-precision integer multiplies can be used.  This is useful on
# hardware that has 24-bit or perhaps 16-bit integer multiply instructions.
binop("amul", tint,
      _2src_commutative + associative,
      "src0 * src1")
1105
# ir3-specific instruction that maps directly to mul-add shift high mix
# (IMADSH_MIX16, i.e. ah * bl << 16 + c): the high 16 bits of src0 times the
# low 16 bits of src1, shifted left by 16, plus src2.  It is used for
# lowering integer multiplication (imul) on the Freedreno backend.
opcode("imadsh_mix16", 1, tint32,
       [1] * 3, [tint32] * 3,
       False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")
1113
# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24-bit multiply into a 32-bit result (with sign extension) plus a 32-bit
# integer.  Each factor is sign-extended from its low 24 bits by shifting
# left and then right by 8.
triop("imad24_ir3", tint32,
      _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24-bit multiply into a 32-bit result (with sign extension), using the same
# shift-based sign extension of both factors.
binop("imul24", tint32,
      _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")