nir: Add b2b opcodes
[mesa.git] / src / compiler / nir / nir_opcodes.py
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 # Connor Abbott (cwabbott0@gmail.com)
25
26 import re
27
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
    """Record holding everything the generator knows about one opcode.

    NOTE: this must be kept in sync with nir_op_info
    """

    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        # Argument type checks.  Only the first element of each list is
        # inspected, so both lists must be non-empty.
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        assert isinstance(input_sizes[0], int)
        assert isinstance(input_types, list)
        assert isinstance(input_types[0], str)
        assert isinstance(is_conversion, bool)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)

        # Every input needs exactly one type.
        assert len(input_sizes) == len(input_types)

        # Outputs may be 0..4 components wide, or 8 or 16; inputs only 0..4.
        assert output_size in (8, 16) or 0 <= output_size <= 4
        assert all(0 <= size <= 4 for size in input_sizes)
        if output_size != 0:
            # An explicitly sized output requires explicitly sized inputs.
            assert all(size != 0 for size in input_sizes)

        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
92
# Shorthand variables for the type strings passed to the opcode helpers.

# Base types without a bit size.
tfloat = "float"
tint = "int"
tuint = "uint"
tbool = "bool"

# Booleans with an explicit bit size.
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"

# Integers and floats with an explicit bit size.
tuint16 = "uint16"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat16 = "float16"
tfloat32 = "float32"
tfloat64 = "float64"
110
# Splits a type string into its base type and an optional bit size.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def _match_type(type_):
    """Match type_ against _TYPE_SPLIT_RE, asserting that it matched."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m


def type_has_size(type_):
    """Return True if the type string carries an explicit bit size."""
    return _match_type(type_).group('bits') is not None


def type_size(type_):
    """Return the bit size of a sized type string as an int.

    Asserts if the string has no bit size.
    """
    bits = _match_type(type_).group('bits')
    assert bits is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)


def type_sizes(type_):
    """Return the list of bit sizes the given type string stands for.

    A sized type yields just its own size; an unsized type yields every
    size listed here for its base type.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    if type_ == 'bool':
        return [1, 8, 16, 32]
    if type_ == 'float':
        return [16, 32, 64]
    return [1, 8, 16, 32, 64]


def type_base_type(type_):
    """Return the base type (int, uint, float or bool) of a type string."""
    return _match_type(type_).group('type')
139
140 # Operation where the first two sources are commutative.
141 #
142 # For 2-source operations, this just mathematical commutativity. Some
143 # 3-source operations, like ffma, are only commutative in the first two
144 # sources.
145 _2src_commutative = "2src_commutative "
146 associative = "associative "
147
148 # global dictionary of opcodes
149 opcodes = {}
150
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Build an Opcode from the arguments and store it in ``opcodes``.

    Asserts that no opcode with the same name has been registered yet.
    """
    assert name not in opcodes
    info = Opcode(name, output_size, output_type, input_sizes, input_types,
                  is_conversion, algebraic_properties, const_expr)
    opcodes[name] = info
157
def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output and input types differ.

    Output and input sizes are both 0, is_conversion is False and no
    algebraic properties are set.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0], input_types=[in_type], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
160
def unop(name, ty, const_expr):
    """Register a one-source opcode whose source and result share type ty.

    Output and input sizes are both 0, is_conversion is False and no
    algebraic properties are set.
    """
    opcode(name=name, output_size=0, output_type=ty,
           input_sizes=[0], input_types=[ty], is_conversion=False,
           algebraic_properties="", const_expr=const_expr)
163
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes.

    is_conversion is False and no algebraic properties are set.
    """
    opcode(name=name, output_size=output_size, output_type=output_type,
           input_sizes=[input_size], input_types=[input_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
168
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the "2", "3" and "4" variants of a one-source reduction.

    - prereduce_expr is a format string with a {src} placeholder that is
      applied to each source component (src0.x, src0.y, ...)
    - reduce_expr is a format string with {src0} and {src1} placeholders
      that combines two partial results
    - final_expr is a format string with a {src} placeholder that is applied
      to the fully reduced, parenthesized expression

    The 4-component variant reduces (x, y) and (z, w) separately before
    combining them.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [
        "(" + prereduce_expr.format(src="src0." + chan) + ")"
        for chan in "xyzw"
    ]
    xy = combine(x, y)
    for width, reduced in ((2, xy),
                           (3, combine(xy, z)),
                           (4, combine(xy, combine(z, w)))):
        unop_horiz(name + str(width), output_size, output_type, width,
                   input_type, finish(reduced))
187
def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode that is flagged as a type conversion.

    Identical to unop_convert except that is_conversion is True.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0], input_types=[in_type], is_conversion=True,
           algebraic_properties="", const_expr=const_expr)
190
# Plain copy of the source.
unop("mov", tuint, "src0")

# Negation and bitwise complement.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer

# Sign, absolute value and saturate.  The float variants pick double or
# float literals depending on bit_size.
unop("fsign", tfloat,
     "bit_size == 64 ? "
     "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
     "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))")
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat,
     "bit_size == 64 ? "
     "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : "
     "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))")

# Reciprocal, square roots, exponential and logarithm.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat,
     "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
210
# Generate all of the numeric conversion opcodes
for src_t in [tint, tuint, tfloat, tbool]:
    # Destination base types generated for each source base type.
    dst_types = {
        tbool: [tfloat, tint, tbool],
        tint: [tfloat, tint, tbool],
        tuint: [tfloat, tuint],
        tfloat: [tint, tuint, tfloat, tbool],
    }[src_t]

    for dst_t in dst_types:
        for dst_bit_size in type_sizes(dst_t):
            # Names are "<src letter>2<dst letter><dst bits>", e.g. "f2i32".
            conv_name = "{0}2{1}{2}".format(src_t[0], dst_t[0], dst_bit_size)
            sized_dst_t = dst_t + str(dst_bit_size)
            float_to_float = dst_t == tfloat and src_t == tfloat

            if float_to_float and dst_bit_size == 16:
                # f2f16 comes in three flavours: two with an explicit
                # rounding mode suffix and one without.
                for rnd_mode, conv_expr in (
                        ('_rtne',
                         "\n"
                         "if (bit_size > 16) {\n"
                         "dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));\n"
                         "} else {\n"
                         "dst = src0;\n"
                         "}\n"),
                        ('_rtz',
                         "\n"
                         "if (bit_size > 16) {\n"
                         "dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));\n"
                         "} else {\n"
                         "dst = src0;\n"
                         "}\n"),
                        ('', "src0")):
                    unop_numeric_convert(conv_name + rnd_mode, sized_dst_t,
                                         src_t, conv_expr)
            elif float_to_float and dst_bit_size == 32:
                # Narrowing from a wider float honours round-to-zero when
                # the execution mode asks for it.
                conv_expr = (
                    "\n"
                    "if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {\n"
                    "dst = _mesa_double_to_float_rtz(src0);\n"
                    "} else {\n"
                    "dst = src0;\n"
                    "}\n")
                unop_numeric_convert(conv_name, sized_dst_t, src_t, conv_expr)
            else:
                # Conversions to bool test against zero; everything else
                # is a plain value conversion.
                conv_expr = "src0 != 0" if dst_t == tbool else "src0"
                unop_numeric_convert(conv_name, sized_dst_t, src_t, conv_expr)
268
# Special opcode that is the same as f2f16 except that it is safe to remove it
# if the result is immediately converted back to float32 again. This is
# generated as part of the precision lowering pass. mp stands for medium
# precision.  It reuses the constant expression already registered for f2f16.
unop_numeric_convert("f2fmp", tfloat16, tfloat, opcodes["f2f16"].const_expr)

# Unary floating-point rounding operations.

unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat,
     "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat,
     "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop("fquantize2f16", tfloat,
     "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : "
     "_mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.

unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.
#
# The derivative of a constant is 0, so every variant folds to 0.0.
for _deriv in ("fddx", "fddy",
               "fddx_fine", "fddy_fine",
               "fddx_coarse", "fddy_coarse"):
    unop(_deriv, tfloat, "0.0")
305
306
307 # Floating point pack and unpack operations.
308
def pack_2x16(fmt):
    """Register "pack_<fmt>_2x16": two float32 components into one uint32.

    The x component ends up in the low 16 bits and y in the high 16 bits.
    Each component is converted by the C helper pack_<fmt>_1x16.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
    op_name = "pack_" + fmt + "_2x16"
    unop_horiz(op_name, 1, tuint32, 2, tfloat32, template.replace("fmt", fmt))
314
def pack_4x8(fmt):
    """Register "pack_<fmt>_4x8": four float32 components into one uint32.

    x, y, z and w land in bits 0-7, 8-15, 16-23 and 24-31 respectively.
    Each component is converted by the C helper pack_<fmt>_1x8.
    """
    template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
    op_name = "pack_" + fmt + "_4x8"
    unop_horiz(op_name, 1, tuint32, 4, tfloat32, template.replace("fmt", fmt))
322
def unpack_2x16(fmt):
    """Register "unpack_<fmt>_2x16": one uint32 into two float32 components.

    dst.x is decoded from the low 16 bits and dst.y from the high 16 bits,
    each by the C helper unpack_<fmt>_1x16.
    """
    # The high half must be shifted *down* before the uint16_t cast.  A left
    # shift by 16 leaves the low 16 bits zero, so the cast would always
    # yield 0 and dst.y would never depend on the upper half of the source.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
328
def unpack_4x8(fmt):
    """Register "unpack_<fmt>_4x8": one uint32 into four float32 components.

    x, y, z and w are decoded from bits 0-7, 8-15, 16-23 and 24-31
    respectively, each by the C helper unpack_<fmt>_1x8.
    """
    template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
    op_name = "unpack_" + fmt + "_4x8"
    unop_horiz(op_name, 4, tfloat32, 1, tuint32, template.replace("fmt", fmt))
336
337
# Instantiate the pack opcodes: snorm and unorm exist in both 2x16 and 4x8
# form, half only as 2x16.  Registration order is kept as-is.
for _fmt in ("snorm", "unorm"):
    pack_2x16(_fmt)
    pack_4x8(_fmt)
pack_2x16("half")

# ... and the matching unpack opcodes.
for _fmt in ("snorm", "unorm"):
    unpack_2x16(_fmt)
    unpack_4x8(_fmt)
unpack_2x16("half")
348
# Pack the low 16 bits of x together with y shifted into the high 16 bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# OR the four components together at byte offsets 0, 8, 16 and 24.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")

# Pack narrower integers into one wider integer, x in the lowest bits.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | "
           "((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
368
# Split one wide integer into narrower components, lowest bits first.  The
# assignment to the narrower dst type truncates the shifted value.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# The source has a single component, so all four lanes (including the top
# one) must be taken from src0.x; src0.w is not a valid source component.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; "
           "dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# The high half is shifted down before the uint16_t cast; shifting left by
# 16 would leave the low 16 bits zero and the cast would always yield 0.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
382
# Lowered floating point unpacking operations.
#
# The _x variants decode the low 16 bits of the source and the _y variants
# the high 16 bits.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: _x is the source itself, _y is the source shifted right
# by the width of the destination type.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
400
# Bit operations, part of ARB_gpu_shader5.

# Mirror the 32 bits of the source.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")

# Count the set bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1)
dst++;
}
""")

# Index of the highest set bit, or -1 if no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")

# Count leading zeros: 31 minus the index of the highest set bit.  The loop
# ends with bit == -1 when no bit is set, which gives 32.
unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
if ((src0 & (1u << bit)) != 0)
break;
}
dst = (unsigned)(31 - bit);
""")

# Signed find-msb: highest bit that differs from the sign, or -1 if none.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
* if src0 >= 0, we're looking for the first 1 bit.
*/
if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
(!((src0 >> bit) & 1) && (src0 < 0))) {
dst = bit;
break;
}
}
""")

# Index of the lowest set bit, or -1 if no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
if ((src0 >> bit) & 1) {
dst = bit;
break;
}
}
""")
460
461
# fnoise<i>_<j>: i output components from j input components, for every
# combination of 1..4.  The constant expression is always 0.0f.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise" + str(i) + "_" + str(j),
                   i, tfloat, j, tfloat, "0.0f")
465
466
# AMD_gcn_shader extended instructions

# Selects the axis with the largest magnitude, picks the two remaining
# coordinates according to its sign, then computes coord / ma + 0.5 with
# ma = 2 * (value on the selected axis).
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# Face index 0..5 for +X, -X, +Y, -Y, +Z, -Z.  The checks are not exclusive,
# so on ties the last matching line wins.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
unop_reduce("fsum", 1, tfloat, tfloat,
            "{src}", "{src0} + {src1}", "{src}")
504
def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share one type.

    Both sources have type in_type and the result has type out_type.  All
    sizes are 0 and is_conversion is False.
    """
    opcode(name=name, output_size=0, output_type=out_type,
           input_sizes=[0, 0], input_types=[in_type, in_type],
           is_conversion=False, algebraic_properties=alg_props,
           const_expr=const_expr)
508
def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result share type ty."""
    binop_convert(name=name, out_type=ty, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)
511
def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool1."""
    binop_convert(name=name, out_type=tbool1, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)


def binop_compare8(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool8."""
    binop_convert(name=name, out_type=tbool8, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)


def binop_compare16(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool16."""
    binop_convert(name=name, out_type=tbool16, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)


def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison on ty that produces a bool32."""
    binop_convert(name=name, out_type=tbool32, in_type=ty,
                  alg_props=alg_props, const_expr=const_expr)


def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register a comparison once per boolean result size.

    The bool1 variant keeps the plain name; the bool8, bool16 and bool32
    variants get "8", "16" and "32" appended.
    """
    variants = (
        ("", binop_compare),
        ("8", binop_compare8),
        ("16", binop_compare16),
        ("32", binop_compare32),
    )
    for suffix, register in variants:
        register(name + suffix, ty, alg_props, const_expr)
529
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit sizes and types per source.

    is_conversion is False and no algebraic properties are set.
    """
    opcode(name=name, output_size=out_size, output_type=out_type,
           input_sizes=[src1_size, src2_size],
           input_types=[src1_type, src2_type],
           is_conversion=False, algebraic_properties="",
           const_expr=const_expr)
534
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the "2", "3" and "4" variants of a two-source reduction.

    - prereduce_expr is a format string with {src0} and {src1} placeholders
      that is applied to each pair of matching source components
    - reduce_expr is a format string with {src0} and {src1} placeholders
      that combines two partial results
    - final_expr is a format string with a {src} placeholder that is applied
      to the fully reduced, parenthesized expression

    Every variant is registered as 2src_commutative.  The 4-component variant
    reduces (x, y) and (z, w) separately before combining them.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    x, y, z, w = [
        "(" + prereduce_expr.format(src0="src0." + chan,
                                    src1="src1." + chan) + ")"
        for chan in "xyzw"
    ]
    xy = combine(x, y)
    for width, reduced in ((2, xy),
                           (3, combine(xy, z)),
                           (4, combine(xy, combine(z, w)))):
        opcode(name + str(width), output_size, output_type,
               [width, width], [src_type, src_type], False,
               _2src_commutative, finish(reduced))
556
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Register a boolean reduction once per boolean result size.

    The bool1 variant keeps the given name.  For the bool8, bool16 and
    bool32 variants the first character of the name is replaced by "b8",
    "b16" and "b32" respectively.
    """
    tail = name[1:]
    variants = (
        (name, tbool1),
        ("b8" + tail, tbool8),
        ("b16" + tail, tbool16),
        ("b32" + tail, tbool32),
    )
    for variant_name, bool_type in variants:
        binop_reduce(variant_name, output_size, bool_type, src_type,
                     prereduce_expr, reduce_expr, final_expr)
567
# Floating-point add.  When the execution mode requests round-to-zero the
# sum goes through the explicit _rtz helpers.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_add_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")

# Saturating integer add and subtract.
binop("iadd_sat", tint, _2src_commutative, """
src1 > 0 ?
(src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
(src0 < src0 + src1 ? (1ull << (bit_size - 1)) : src0 + src1)
""")
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : "
      "(src0 + src1)")
binop("isub_sat", tint, "", """
src1 < 0 ?
(src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
(src0 < src0 - src1 ? (1ull << (bit_size - 1)) : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Floating-point subtract, with the same round-to-zero handling as fadd.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_sub_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")

# Absolute difference, always computed as larger minus smaller.
binop_convert("uabs_isub", tuint, tint, "", """
src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
: (uint64_t) src0 - (uint64_t) src1
""")
binop("uabs_usub", tuint, "",
      "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
609
# Floating-point multiply.  When the execution mode requests round-to-zero
# the product goes through the explicit _rtz helpers.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_mul_rtz(src0, src1);
else
dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
dst = src0 * src1;
}
""")

# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
/* We need to do a full 128-bit x 128-bit multiply in order for the sign
* extension to work properly. The casts are kind-of annoying but needed
* to prevent compiler warnings.
*/
uint32_t src0_u32[4] = {
src0,
(int64_t)src0 >> 32,
(int64_t)src0 >> 63,
(int64_t)src0 >> 63,
};
uint32_t src1_u32[4] = {
src1,
(int64_t)src1 >> 32,
(int64_t)src1 >> 63,
(int64_t)src1 >> 63,
};
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
/* The casts are kind-of annoying but needed to prevent compiler warnings. */
uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
uint32_t prod_u32[4];
ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# low 32-bits of unsigned integer multiply
#
# Both sources are masked to their low bit_size / 2 bits before multiplying.
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")

# Multiply 32-bits with low 16-bits.
binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")

# Division.  The integer variants fold a division by zero to 0.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
683
# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.  The sum wrapped around exactly when it is
# smaller than src0.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative,
              "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "",
              "src0 < src1")
693
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")

# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
#
# Note the subtraction: e.g. rhadd(3, 0) = (3 + 0 + 1) >> 1 = 2, and
# (3 | 0) - ((3 ^ 0) >> 1) = 3 - 1 = 2, whereas adding the two terms would
# give 4.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
719
# Unsigned modulus; a zero divisor folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when it is non-zero and the operands
# have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : "
      "((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")

# Float equivalents: fmod rounds the quotient down, frem truncates it.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
736
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

# float comparisons
binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")

# signed integer comparisons
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")

# unsigned integer comparisons
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce_all_sizes("ball_fequal", 1, tfloat,
                       "{src0} == {src1}", "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat,
                       "{src0} != {src1}", "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal", 1, tint,
                       "{src0} == {src1}", "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint,
                       "{src0} != {src1}", "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32,
             "{src0} == {src1}", "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32,
             "{src0} != {src1}", "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

# Set on Less Than
binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f")
# Set on Greater or Equal
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f")
# Set on Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f")
# Set on Not Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f")
780
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is therefore masked with (bit width of src0) - 1.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Rotates: the bits shifted out on one side are ORed back in on the other.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 << (src1 & rotate_mask)) |
(src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
uint32_t rotate_mask = sizeof(src0) * 8 - 1;
dst = (src0 >> (src1 & rotate_mask)) |
(src0 << (-src1 & rotate_mask));
""")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.

binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
811
812
# Float dot product: per-component products ("{src0} * {src1}") combined with
# addition ("{src0} + {src1}").  The exact expansion is performed by
# binop_reduce, which is defined elsewhere in this file.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Same expressions as fdot, but declared with an output size of 4 instead of 1
# (presumably the scalar result is replicated to all four components --
# confirm against binop_reduce).
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Homogeneous dot product: src0 has 3 components and src1 has 4.  The
# expression is the 3-component dot product plus src1.w, i.e. src0 is treated
# as if its w component were 1.  fdph_replicated uses the same expression with
# an output size of 4.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
823
# Two-source minimum and maximum for float, signed and unsigned integer types.
# The float variants call the C fmin()/fmax() functions; the integer variants
# are written as an explicit comparison that selects one of the two sources.
binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
830
# The *_4x8 opcodes below treat each 32-bit source as four 8-bit lanes.  Every
# constant expression loops over bit offsets 0, 8, 16 and 24, extracts the
# lane from each source with a shift and a 0xff mask, and ORs the per-lane
# result back into dst at the same offset.

# Saturated vector add for 4 8bit ints.
# Each lane sum is clamped to 0xff with MIN2.
binop("usadd_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A lane is only written when src0's lane is greater than src1's, so every
# other lane keeps the initial value of zero.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
if (src0_chan > src1_chan)
dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
# The division is an integer division on int operands, so each lane's
# quotient is truncated.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
int src1_chan = (src1 >> i) & 0xff;
dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
875
# Power function: src0 raised to src1.
#
# fpow is declared for every float bit size (tfloat).  The double-precision
# pow() must be used for 64-bit operands and the single-precision powf() for
# the smaller sizes; the selection was previously inverted, which evaluated
# 64-bit constants through powf().
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
877
# Pack two float32 scalars into one 32-bit value: each source's x component is
# converted with pack_half_1x16(); src0 lands in the low 16 bits and src1 in
# the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Combine two 32-bit values into a 64-bit one: src0 is the low half and src1,
# widened to uint64_t before shifting, is the high half.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# Combine two 16-bit values into a 32-bit one: src0 is the low half and src1,
# widened to uint32_t before shifting, is the high half.
binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
886
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, the bits and offset values
# are from the low five bits of src0 and src1, respectively.
#
# The result is a mask of `bits` consecutive one bits shifted left by `offset`.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""")

# ldexp: scales src0 by a power of two given by the 32-bit integer src1, using
# C ldexp() for 64-bit operands and ldexpf() otherwise.  Any result for which
# isnormal() is false is then replaced by a zero carrying the sign of src0.
# NOTE(review): C isnormal() is also false for infinities and NaNs, so those
# results are replaced by zero as well -- confirm this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
""")
902
# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
#
# src1 selects which byte of src0 to take: src0 is shifted right by src1 * 8
# bits and the low byte is kept.  The cast decides the extension: uint8_t for
# the unsigned variant and int8_t for the signed one.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
#
# Same scheme as byte extraction with 16-bit units: shift by src1 * 16 bits
# and cast to uint16_t / int16_t.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
917
918
def triop(name, ty, alg_props, const_expr):
    """Declare a three-source opcode whose result and sources all use `ty`.

    Forwards to opcode() with an output size of 0, an input size of 0 for each
    of the three sources, is_conversion set to False, and the given algebraic
    properties and constant expression.
    """
    source_count = 3
    opcode(name, 0, ty, [0] * source_count, [ty] * source_count, False,
           alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Declare a three-source tuint opcode with explicit operand sizes.

    The output and all three inputs are typed tuint; the output size and each
    input size are passed through as given.  The opcode is registered as a
    non-conversion with no algebraic properties.
    """
    input_sizes = [src1_size, src2_size, src3_size]
    input_types = [tuint for _ in input_sizes]
    opcode(name, output_size, tuint, input_sizes, input_types, False, "",
           const_expr)
925
# Fused multiply-add: src0 * src1 + src2.
#
# When nir_is_rounding_mode_rtz() reports round-toward-zero for this bit size,
# the _mesa_*_fma_rtz helpers are used: the double helper for 64-bit, the float
# helper for 32-bit, and for any other size the double helper followed by
# _mesa_double_to_float_rtz().  Otherwise the C library fmaf() is used for
# 32-bit operands and fma() for every other size.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
if (bit_size == 64)
dst = _mesa_double_fma_rtz(src0, src1, src2);
else if (bit_size == 32)
dst = _mesa_float_fma_rtz(src0, src1, src2);
else
dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
if (bit_size == 32)
dst = fmaf(src0, src1, src2);
else
dst = fma(src0, src1, src2);
}
""")

# Linear interpolation: blends src0 and src1 with weight src2, giving src0
# when src2 is 0 and src1 when src2 is 1.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
#
# fcsel is the floating point version: src1 is chosen when src0 is not 0.0f.


triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
952
# 3 way min/max/med
#
# The float variants are declared for every float bit size (tfloat), so they
# use the double-precision fmin()/fmax(), as the two-source fmin/fmax opcodes
# do.  The single-precision fminf()/fmaxf() would round 64-bit operands to
# float before comparing them; for smaller sizes the result is unchanged
# because the selected value is always one of the (exactly representable)
# inputs.
#
# The median is computed as max(min(max(a, b), c), min(a, b)).
triop("fmin3", tfloat, "", "fmin(src0, fmin(src1, src2))")
triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "", "fmax(src0, fmax(src1, src2))")
triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")

triop("fmed3", tfloat, "", "fmax(fmin(fmax(src0, src1), src2), fmin(src0, src1))")
triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
965
# Boolean conditional select: src1 when the condition src0 is true, otherwise
# src2.  One opcode is declared per boolean condition type (tbool1, tbool8,
# tbool16, tbool32); the selected values are tuint in every case.
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b8csel", 0, tuint, [0, 0, 0],
       [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b16csel", 0, tuint, [0, 0, 0],
       [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
974
# SM5 bfi assembly
#
# Inserts src1 into src2 under the mask src0.  `insert` is shifted left once
# for every trailing zero bit of the mask so that its bit 0 lines up with the
# lowest set mask bit; bits covered by the mask then come from `insert` and
# all other bits from `base`.  A zero mask returns `base` unchanged, which
# also keeps the alignment loop from running forever.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
} else {
unsigned tmp = mask;
while (!(tmp & 1)) {
tmp >>= 1;
insert <<= 1;
}
dst = (base & ~mask) | (insert & mask);
}
""")


# Per-bit select: where a bit of src0 is set the result takes the bit from
# src1, otherwise from src2.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
992
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
#
# Both extract `bits` bits of src0 starting at bit `offset`:
#  - bits == 0 yields 0;
#  - when the field ends below bit 32, the value is shifted left so the
#    field's top bit reaches bit 31 and then right by (32 - bits) to bring
#    the field down to bit 0;
#  - otherwise the value is simply shifted right by `offset`.
# ubfe works on an unsigned base.  ibfe declares base as a signed int, so its
# right shifts are expected to sign-extend the extracted field.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
dst = 0;
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
dst = 0;
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
1020
# GLSL bitfieldExtract()
#
# Unsigned variant: returns `bits` bits of src0 starting at bit `offset`,
# zero-extended.  A zero-width field yields 0, and so do the cases the spec
# leaves undefined (negative offset or bits, or a field reaching past bit 32).
# The mask is built from a 64-bit one (1ull) so that bits == 32 works.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed GLSL bitfieldExtract(): returns `bits` bits of src0 starting at bit
# `offset`, sign-extended from the field's top bit.  A zero-width field and
# out-of-range offset/bits combinations yield 0.
#
# The field is first shifted left so that its top bit sits in bit 31, then
# shifted right by (32 - bits) so that it ends at bit 0 with the sign
# replicated above it.  The right-shift amount used to be `offset`, which left
# the field at the wrong position whenever offset != 32 - bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
dst = 0;
} else {
dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1046
# Combines the first component of each input to make a 3-component vector.
# Declared through triop_horiz with an output size of 3 and three inputs of
# size 1.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1054
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Declare a four-source tuint opcode with explicit operand sizes.

    The output and all four inputs are typed tuint; the output size and each
    input size are passed through as given.  The opcode is registered as a
    non-conversion with no algebraic properties.
    """
    input_sizes = [src1_size, src2_size, src3_size, src4_size]
    input_types = [tuint for _ in input_sizes]
    opcode(name, output_size, tuint, input_sizes, input_types, False, "",
           const_expr)
1061
# Bitfield insert: replaces `bits` bits of src0 (base), starting at bit
# `offset`, with the low bits of src1 (insert).
#  - bits == 0 returns base unchanged;
#  - a negative offset or bits, or a field reaching past bit 32, returns 0;
#  - otherwise a mask of `bits` ones at `offset` selects the inserted bits.
# The mask is built from a 64-bit one (1ull) so that bits == 32 works.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
dst = 0;
} else {
unsigned mask = ((1ull << bits) - 1) << offset;
dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1075
# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

# 8-component variant: eight size-1 tuint inputs, one per output component.
# Components past w are named e, f, g, h.
opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

# 16-component variant: sixteen size-1 tuint inputs, one per output component.
# Components past w are named e through p.
opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")
1116
# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
#
# The constant expression itself is a plain multiplication.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")
1128
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
#
# The expression multiplies the high 16 bits of src0 by the low 16 bits of
# src1, shifts the product left by 16 and adds src2.
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
#
# Each multiplicand is shifted left by 8 and back right by 8 as an int32_t,
# which keeps its low 24 bits and sign-extends them to 32 bits.
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension)
# Uses the same shift-left/shift-right pair to sign-extend the low 24 bits of
# both sources before multiplying.
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")