src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
  93 # helper variables for strings
  94 tfloat = "float"
  95 tint = "int"
  96 tbool = "bool"
  97 tbool1 = "bool1"
  98 tbool32 = "bool32"
  99 tuint = "uint"
 100 tuint16 = "uint16"
 101 tfloat32 = "float32"
 102 tint32 = "int32"
 103 tuint32 = "uint32"
 104 tint64 = "int64"
 105 tuint64 = "uint64"
 106 tfloat64 = "float64"
 107
 108 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 109
 110 def type_has_size(type_):
 111     m = _TYPE_SPLIT_RE.match(type_)
 112     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 113     return m.group('bits') is not None
 114
 115 def type_size(type_):
 116     m = _TYPE_SPLIT_RE.match(type_)
 117     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 118     assert m.group('bits') is not None, \
 119            'NIR type string has no bit size: "{}"'.format(type_)
 120     return int(m.group('bits'))
 121
 122 def type_sizes(type_):
 123     if type_has_size(type_):
 124         return [type_size(type_)]
 125     elif type_ == 'bool':
 126         return [1, 32]
 127     elif type_ == 'float':
 128         return [16, 32, 64]
 129     else:
 130         return [1, 8, 16, 32, 64]
 131
 132 def type_base_type(type_):
 133     m = _TYPE_SPLIT_RE.match(type_)
 134     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 135     return m.group('type')
 136
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
 142 _2src_commutative = "2src_commutative "
 143 associative = "associative "
 144
 145 # global dictionary of opcodes
 146 opcodes = {}
 147
 148 def opcode(name, output_size, output_type, input_sizes, input_types,
 149            is_conversion, algebraic_properties, const_expr):
 150    assert name not in opcodes
 151    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 152                           input_types, is_conversion, algebraic_properties,
 153                           const_expr)
 154
 155 def unop_convert(name, out_type, in_type, const_expr):
 156    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 157
 158 def unop(name, ty, const_expr):
 159    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 160
 161 def unop_horiz(name, output_size, output_type, input_size, input_type,
 162                const_expr):
 163    opcode(name, output_size, output_type, [input_size], [input_type],
 164           False, "", const_expr)
 165
 166 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 167                 reduce_expr, final_expr):
 168    def prereduce(src):
 169       return "(" + prereduce_expr.format(src=src) + ")"
 170    def final(src):
 171       return final_expr.format(src="(" + src + ")")
 172    def reduce_(src0, src1):
 173       return reduce_expr.format(src0=src0, src1=src1)
 174    src0 = prereduce("src0.x")
 175    src1 = prereduce("src0.y")
 176    src2 = prereduce("src0.z")
 177    src3 = prereduce("src0.w")
 178    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 179               final(reduce_(src0, src1)))
 180    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 181               final(reduce_(reduce_(src0, src1), src2)))
 182    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 183               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 184
 185 def unop_numeric_convert(name, out_type, in_type, const_expr):
 186    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 187
 188 unop("mov", tuint, "src0")
 189
 190 unop("ineg", tint, "-src0")
 191 unop("fneg", tfloat, "-src0")
 192 unop("inot", tint, "~src0") # invert every bit of the integer
 193 unop("fsign", tfloat, ("bit_size == 64 ? " +
 194                        "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
 195                        "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
 196 unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
 197 unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
 198 unop("fabs", tfloat, "fabs(src0)")
 199 unop("fsat", tfloat, ("bit_size == 64 ? " +
 200                       "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
 201                       "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
 202 unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
 203 unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
 204 unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
 205 unop("fexp2", tfloat, "exp2f(src0)")
 206 unop("flog2", tfloat, "log2f(src0)")
 207
 208 # Generate all of the numeric conversion opcodes
 209 for src_t in [tint, tuint, tfloat, tbool]:
 210    if src_t == tbool:
 211       dst_types = [tfloat, tint]
 212    elif src_t == tint:
 213       dst_types = [tfloat, tint, tbool]
 214    elif src_t == tuint:
 215       dst_types = [tfloat, tuint]
 216    elif src_t == tfloat:
 217       dst_types = [tint, tuint, tfloat, tbool]
 218
 219    for dst_t in dst_types:
 220       for bit_size in type_sizes(dst_t):
 221           if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
 222               rnd_modes = ['_rtne', '_rtz', '']
 223               for rnd_mode in rnd_modes:
 224                   unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
 225                                                               bit_size, rnd_mode),
 226                                        dst_t + str(bit_size), src_t, "src0")
 227           else:
 228               conv_expr = "src0 != 0" if dst_t == tbool else "src0"
 229               unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
 230                                    dst_t + str(bit_size), src_t, conv_expr)
 231
 232
 233 # Unary floating-point rounding operations.
 234
 235
 236 unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
 237 unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
 238 unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
 239 unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
 240 unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
 241
 242 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 243
 244 # Trigonometric operations.
 245
 246
 247 unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
 248 unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
 249
 250 # dfrexp
 251 unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
 252 unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
 253
 254 # Partial derivatives.
 255
 256
 257 unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
 258 unop("fddy", tfloat, "0.0")
 259 unop("fddx_fine", tfloat, "0.0")
 260 unop("fddy_fine", tfloat, "0.0")
 261 unop("fddx_coarse", tfloat, "0.0")
 262 unop("fddy_coarse", tfloat, "0.0")
 263
 264
 265 # Floating point pack and unpack operations.
 266
 267 def pack_2x16(fmt):
 268    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 269 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 270 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 271 """.replace("fmt", fmt))
 272
 273 def pack_4x8(fmt):
 274    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 275 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 276 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 277 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 278 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 279 """.replace("fmt", fmt))
 280
 281 def unpack_2x16(fmt):
 282    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 283 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 284 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 285 """.replace("fmt", fmt))
 286
 287 def unpack_4x8(fmt):
 288    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 289 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 290 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 291 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 292 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 293 """.replace("fmt", fmt))
 294
 295
 296 pack_2x16("snorm")
 297 pack_4x8("snorm")
 298 pack_2x16("unorm")
 299 pack_4x8("unorm")
 300 pack_2x16("half")
 301 unpack_2x16("snorm")
 302 unpack_4x8("snorm")
 303 unpack_2x16("unorm")
 304 unpack_4x8("unorm")
 305 unpack_2x16("half")
 306
 307 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 308 dst.x = (src0.x & 0xffff) | (src0.y << 16);
 309 """)
 310
 311 unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
 312 dst.x = (src0.x <<  0) |
 313         (src0.y <<  8) |
 314         (src0.z << 16) |
 315         (src0.w << 24);
 316 """)
 317
 318 unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
 319            "dst.x = src0.x | ((uint32_t)src0.y << 16);")
 320
 321 unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
 322            "dst.x = src0.x | ((uint64_t)src0.y << 32);")
 323
 324 unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
 325            "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
 326
 327 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
 328            "dst.x = src0.x; dst.y = src0.x >> 32;")
 329
 330 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 331            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 332
 333 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
 334            "dst.x = src0.x; dst.y = src0.x >> 16;")
 335
 336 # Lowered floating point unpacking operations.
 337
 338
 339 unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
 340              "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
 341 unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
 342              "unpack_half_1x16((uint16_t)(src0 >> 16))")
 343
 344 unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
 345 unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")
 346
 347 unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
 348 unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 349
 350 # Bit operations, part of ARB_gpu_shader5.
 351
 352
 353 unop("bitfield_reverse", tuint32, """
 354 /* we're not winning any awards for speed here, but that's ok */
 355 dst = 0;
 356 for (unsigned bit = 0; bit < 32; bit++)
 357    dst |= ((src0 >> bit) & 1) << (31 - bit);
 358 """)
 359 unop_convert("bit_count", tuint32, tuint, """
 360 dst = 0;
 361 for (unsigned bit = 0; bit < bit_size; bit++) {
 362    if ((src0 >> bit) & 1)
 363       dst++;
 364 }
 365 """)
 366
 367 unop_convert("ufind_msb", tint32, tuint, """
 368 dst = -1;
 369 for (int bit = bit_size - 1; bit >= 0; bit--) {
 370    if ((src0 >> bit) & 1) {
 371       dst = bit;
 372       break;
 373    }
 374 }
 375 """)
 376
 377 unop("ifind_msb", tint32, """
 378 dst = -1;
 379 for (int bit = 31; bit >= 0; bit--) {
 380    /* If src0 < 0, we're looking for the first 0 bit.
 381     * if src0 >= 0, we're looking for the first 1 bit.
 382     */
 383    if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
 384       (!((src0 >> bit) & 1) && (src0 < 0))) {
 385       dst = bit;
 386       break;
 387    }
 388 }
 389 """)
 390
 391 unop_convert("find_lsb", tint32, tint, """
 392 dst = -1;
 393 for (unsigned bit = 0; bit < bit_size; bit++) {
 394    if ((src0 >> bit) & 1) {
 395       dst = bit;
 396       break;
 397    }
 398 }
 399 """)
 400
 401
 402 for i in range(1, 5):
 403    for j in range(1, 5):
 404       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 405
 406
 407 # AMD_gcn_shader extended instructions
 408 unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
 409 dst.x = dst.y = 0.0;
 410 float absX = fabs(src0.x);
 411 float absY = fabs(src0.y);
 412 float absZ = fabs(src0.z);
 413
 414 float ma = 0.0;
 415 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
 416 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
 417 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
 418
 419 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
 420 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
 421 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
 422 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
 423 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
 424 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
 425
 426 dst.x = dst.x / ma + 0.5;
 427 dst.y = dst.y / ma + 0.5;
 428 """)
 429
 430 unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
 431 float absX = fabs(src0.x);
 432 float absY = fabs(src0.y);
 433 float absZ = fabs(src0.z);
 434 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
 435 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
 436 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
 437 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
 438 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
 439 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
 440 """)
 441
 442
 443 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 444    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 445           False, alg_props, const_expr)
 446
 447 def binop(name, ty, alg_props, const_expr):
 448    binop_convert(name, ty, ty, alg_props, const_expr)
 449
 450 def binop_compare(name, ty, alg_props, const_expr):
 451    binop_convert(name, tbool1, ty, alg_props, const_expr)
 452
 453 def binop_compare32(name, ty, alg_props, const_expr):
 454    binop_convert(name, tbool32, ty, alg_props, const_expr)
 455
 456 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 457                 src2_type, const_expr):
 458    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 459           False, "", const_expr)
 460
 461 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 462                  reduce_expr, final_expr):
 463    def final(src):
 464       return final_expr.format(src= "(" + src + ")")
 465    def reduce_(src0, src1):
 466       return reduce_expr.format(src0=src0, src1=src1)
 467    def prereduce(src0, src1):
 468       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 469    src0 = prereduce("src0.x", "src1.x")
 470    src1 = prereduce("src0.y", "src1.y")
 471    src2 = prereduce("src0.z", "src1.z")
 472    src3 = prereduce("src0.w", "src1.w")
 473    opcode(name + "2", output_size, output_type,
 474           [2, 2], [src_type, src_type], False, _2src_commutative,
 475           final(reduce_(src0, src1)))
 476    opcode(name + "3", output_size, output_type,
 477           [3, 3], [src_type, src_type], False, _2src_commutative,
 478           final(reduce_(reduce_(src0, src1), src2)))
 479    opcode(name + "4", output_size, output_type,
 480           [4, 4], [src_type, src_type], False, _2src_commutative,
 481           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 482
 483 binop("fadd", tfloat, _2src_commutative + associative, "src0 + src1")
 484 binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
 485 binop("iadd_sat", tint, _2src_commutative, """
 486       src1 > 0 ?
 487          (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
 488          (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
 489 """)
 490 binop("uadd_sat", tuint, _2src_commutative,
 491       "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
 492 binop("isub_sat", tint, "", """
 493       src1 < 0 ?
 494          (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
 495          (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
 496 """)
 497 binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")
 498
 499 binop("fsub", tfloat, "", "src0 - src1")
 500 binop("isub", tint, "", "src0 - src1")
 501
 502 binop("fmul", tfloat, _2src_commutative + associative, "src0 * src1")
 503 # low 32-bits of signed/unsigned integer multiply
 504 binop("imul", tint, _2src_commutative + associative, "src0 * src1")
 505
 506 # Generate 64 bit result from 2 32 bits quantity
 507 binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
 508               "(int64_t)src0 * (int64_t)src1")
 509 binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
 510               "(uint64_t)src0 * (uint64_t)src1")
 511
 512 # high 32-bits of signed integer multiply
 513 binop("imul_high", tint, _2src_commutative, """
 514 if (bit_size == 64) {
 515    /* We need to do a full 128-bit x 128-bit multiply in order for the sign
 516     * extension to work properly.  The casts are kind-of annoying but needed
 517     * to prevent compiler warnings.
 518     */
 519    uint32_t src0_u32[4] = {
 520       src0,
 521       (int64_t)src0 >> 32,
 522       (int64_t)src0 >> 63,
 523       (int64_t)src0 >> 63,
 524    };
 525    uint32_t src1_u32[4] = {
 526       src1,
 527       (int64_t)src1 >> 32,
 528       (int64_t)src1 >> 63,
 529       (int64_t)src1 >> 63,
 530    };
 531    uint32_t prod_u32[4];
 532    ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
 533    dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
 534 } else {
 535    dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
 536 }
 537 """)
 538
 539 # high 32-bits of unsigned integer multiply
 540 binop("umul_high", tuint, _2src_commutative, """
 541 if (bit_size == 64) {
 542    /* The casts are kind-of annoying but needed to prevent compiler warnings. */
 543    uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
 544    uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
 545    uint32_t prod_u32[4];
 546    ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
 547    dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
 548 } else {
 549    dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
 550 }
 551 """)
 552
 553 # low 32-bits of unsigned integer multiply
 554 binop("umul_low", tuint32, _2src_commutative, """
 555 uint64_t mask = (1 << (bit_size / 2)) - 1;
 556 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
 557 """)
 558
 559
 560 binop("fdiv", tfloat, "", "src0 / src1")
 561 binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
 562 binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
 563
 564 # returns a boolean representing the carry resulting from the addition of
 565 # the two unsigned arguments.
 566
 567 binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")
 568
 569 # returns a boolean representing the borrow resulting from the subtraction
 570 # of the two unsigned arguments.
 571
 572 binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 573
 574 # hadd: (a + b) >> 1 (without overflow)
 575 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
 576 #       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
 577 #       = 2 *  (x & y) + (x & ~y) +                (~x & y)
 578 #       =     ((x & y) << 1) + (x ^ y)
 579 #
 580 # Since we know that the bottom bit of (x & y) << 1 is zero,
 581 #
 582 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
 583 #              =   (x & y) +      ((x ^ y)  >> 1)
 584 binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 585 binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 586
 587 # rhadd: (a + b + 1) >> 1 (without overflow)
 588 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 589 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 590 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 591 #           =     ((x | y) << 1) - (x ^ y) + 1
 592 #
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) -  ((x ^ y)      >> 1)
 597 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 598 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 599
 600 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
 601
 602 # For signed integers, there are several different possible definitions of
 603 # "modulus" or "remainder".  We follow the conventions used by LLVM and
 604 # SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
 605 # operation while the imod opcode implements the more mathematical
 606 # "modulus" operation.  For details on the difference, see
 607 #
 608 # http://mathforum.org/library/drmath/view/52343.html
 609
 610 binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
 611 binop("imod", tint, "",
 612       "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
 613       "                 src0 % src1 : src0 % src1 + src1)")
 614 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 615 binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 616
 617 #
 618 # Comparisons
 619 #
 620
 621
 622 # these integer-aware comparisons return a boolean (0 or ~0)
 623
 624 binop_compare("flt", tfloat, "", "src0 < src1")
 625 binop_compare("fge", tfloat, "", "src0 >= src1")
 626 binop_compare("feq", tfloat, _2src_commutative, "src0 == src1")
 627 binop_compare("fne", tfloat, _2src_commutative, "src0 != src1")
 628 binop_compare("ilt", tint, "", "src0 < src1")
 629 binop_compare("ige", tint, "", "src0 >= src1")
 630 binop_compare("ieq", tint, _2src_commutative, "src0 == src1")
 631 binop_compare("ine", tint, _2src_commutative, "src0 != src1")
 632 binop_compare("ult", tuint, "", "src0 < src1")
 633 binop_compare("uge", tuint, "", "src0 >= src1")
 634 binop_compare32("flt32", tfloat, "", "src0 < src1")
 635 binop_compare32("fge32", tfloat, "", "src0 >= src1")
 636 binop_compare32("feq32", tfloat, _2src_commutative, "src0 == src1")
 637 binop_compare32("fne32", tfloat, _2src_commutative, "src0 != src1")
 638 binop_compare32("ilt32", tint, "", "src0 < src1")
 639 binop_compare32("ige32", tint, "", "src0 >= src1")
 640 binop_compare32("ieq32", tint, _2src_commutative, "src0 == src1")
 641 binop_compare32("ine32", tint, _2src_commutative, "src0 != src1")
 642 binop_compare32("ult32", tuint, "", "src0 < src1")
 643 binop_compare32("uge32", tuint, "", "src0 >= src1")
 644
 645 # integer-aware GLSL-style comparisons that compare floats and ints
 646
 647 binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
 648              "{src0} && {src1}", "{src}")
 649 binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
 650              "{src0} || {src1}", "{src}")
 651 binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
 652              "{src0} && {src1}", "{src}")
 653 binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
 654              "{src0} || {src1}", "{src}")
 655
 656 binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
 657              "{src0} && {src1}", "{src}")
 658 binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
 659              "{src0} || {src1}", "{src}")
 660 binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
 661              "{src0} && {src1}", "{src}")
 662 binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
 663              "{src0} || {src1}", "{src}")
 664
 665 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 666
 667 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
 668              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
 669 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
 670              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 671
 672 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 673 # and false respectively
 674
 675 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
 676 binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 677 binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 678 binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 679
 680 # SPIRV shifts are undefined for shift-operands >= bitsize,
 681 # but SM5 shifts are defined to use the least significant bits, only
 682 # The NIR definition is according to the SM5 specification.
 683 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
 684        "src0 << (src1 & (sizeof(src0) * 8 - 1))")
 685 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
 686        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 687 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
 688        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 689
 690 opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 691    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 692    dst = (src0 << (src1 & rotate_mask)) |
 693          (src0 >> (-src1 & rotate_mask));
 694 """)
 695 opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 696    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 697    dst = (src0 >> (src1 & rotate_mask)) |
 698          (src0 << (-src1 & rotate_mask));
 699 """)
 700
 701 # bitwise logic operators
 702 #
 703 # These are also used as boolean and, or, xor for hardware supporting
 704 # integers.
 705
 706
 707 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
 708 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
 709 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 710
 711
 712 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 713              "{src}")
 714
 715 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 716              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 717
 718 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
 719        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 720 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
 721        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 722
 723 binop("fmin", tfloat, "", "fminf(src0, src1)")
 724 binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 725 binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 726 binop("fmax", tfloat, "", "fmaxf(src0, src1)")
 727 binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 728 binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 729
 730 # Saturated vector add for 4 8bit ints.
 731 binop("usadd_4x8", tint32, _2src_commutative + associative, """
 732 dst = 0;
 733 for (int i = 0; i < 32; i += 8) {
 734    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 735 }
 736 """)
 737
 738 # Saturated vector subtract for 4 8bit ints.
 739 binop("ussub_4x8", tint32, "", """
 740 dst = 0;
 741 for (int i = 0; i < 32; i += 8) {
 742    int src0_chan = (src0 >> i) & 0xff;
 743    int src1_chan = (src1 >> i) & 0xff;
 744    if (src0_chan > src1_chan)
 745       dst |= (src0_chan - src1_chan) << i;
 746 }
 747 """)
 748
 749 # vector min for 4 8bit ints.
 750 binop("umin_4x8", tint32, _2src_commutative + associative, """
 751 dst = 0;
 752 for (int i = 0; i < 32; i += 8) {
 753    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 754 }
 755 """)
 756
 757 # vector max for 4 8bit ints.
 758 binop("umax_4x8", tint32, _2src_commutative + associative, """
 759 dst = 0;
 760 for (int i = 0; i < 32; i += 8) {
 761    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 762 }
 763 """)
 764
 765 # unorm multiply: (a * b) / 255.
 766 binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
 767 dst = 0;
 768 for (int i = 0; i < 32; i += 8) {
 769    int src0_chan = (src0 >> i) & 0xff;
 770    int src1_chan = (src1 >> i) & 0xff;
 771    dst |= ((src0_chan * src1_chan) / 255) << i;
 772 }
 773 """)
 774
 775 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 776
 777 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 778             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 779
 780 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 781               "src0 | ((uint64_t)src1 << 32)")
 782
 783 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
 784               "src0 | ((uint32_t)src1 << 16)")
 785
 786 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 787 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
 788 # are from the low five bits of src0 and src1, respectively.
 789 binop_convert("bfm", tuint32, tint32, "", """
 790 int bits = src0 & 0x1F;
 791 int offset = src1 & 0x1F;
 792 dst = ((1u << bits) - 1) << offset;
 793 """)
 794
 795 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
 796 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 797 /* flush denormals to zero. */
 798 if (!isnormal(dst))
 799    dst = copysignf(0.0f, src0);
 800 """)
 801
 802 # Combines the first component of each input to make a 2-component vector.
 803
 804 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 805 dst.x = src0.x;
 806 dst.y = src1.x;
 807 """)
 808
 809 # Byte extraction
 810 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 811 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 812
 813 # Word extraction
 814 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 815 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 816
 817
 818 def triop(name, ty, alg_props, const_expr):
 819    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 820 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 821    opcode(name, output_size, tuint,
 822    [src1_size, src2_size, src3_size],
 823    [tuint, tuint, tuint], False, "", const_expr)
 824
 825 triop("ffma", tfloat, _2src_commutative, "src0 * src1 + src2")
 826
 827 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 828
 829 # Conditional Select
 830 #
 831 # A vector conditional select instruction (like ?:, but operating per-
 832 # component on vectors). There are two versions, one for floating point
 833 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 834
 835
 836 triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 837
 838 # 3 way min/max/med
 839 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 840 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 841 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 842
 843 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 844 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 845 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 846
 847 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 848 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 849 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 850
 851 opcode("bcsel", 0, tuint, [0, 0, 0],
 852       [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
 853 opcode("b32csel", 0, tuint, [0, 0, 0],
 854        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 855
 856 # SM5 bfi assembly
 857 triop("bfi", tuint32, "", """
 858 unsigned mask = src0, insert = src1, base = src2;
 859 if (mask == 0) {
 860    dst = base;
 861 } else {
 862    unsigned tmp = mask;
 863    while (!(tmp & 1)) {
 864       tmp >>= 1;
 865       insert <<= 1;
 866    }
 867    dst = (base & ~mask) | (insert & mask);
 868 }
 869 """)
 870
 871
 872 triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 873
 874 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
 875 opcode("ubfe", 0, tuint32,
 876        [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
 877 unsigned base = src0;
 878 unsigned offset = src1 & 0x1F;
 879 unsigned bits = src2 & 0x1F;
 880 if (bits == 0) {
 881    dst = 0;
 882 } else if (offset + bits < 32) {
 883    dst = (base << (32 - bits - offset)) >> (32 - bits);
 884 } else {
 885    dst = base >> offset;
 886 }
 887 """)
 888 opcode("ibfe", 0, tint32,
 889        [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
 890 int base = src0;
 891 unsigned offset = src1 & 0x1F;
 892 unsigned bits = src2 & 0x1F;
 893 if (bits == 0) {
 894    dst = 0;
 895 } else if (offset + bits < 32) {
 896    dst = (base << (32 - bits - offset)) >> (32 - bits);
 897 } else {
 898    dst = base >> offset;
 899 }
 900 """)
 901
 902 # GLSL bitfieldExtract()
 903 opcode("ubitfield_extract", 0, tuint32,
 904        [0, 0, 0], [tuint32, tint32, tint32], False, "", """
 905 unsigned base = src0;
 906 int offset = src1, bits = src2;
 907 if (bits == 0) {
 908    dst = 0;
 909 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
 910    dst = 0; /* undefined per the spec */
 911 } else {
 912    dst = (base >> offset) & ((1ull << bits) - 1);
 913 }
 914 """)
 915 opcode("ibitfield_extract", 0, tint32,
 916        [0, 0, 0], [tint32, tint32, tint32], False, "", """
 917 int base = src0;
 918 int offset = src1, bits = src2;
 919 if (bits == 0) {
 920    dst = 0;
 921 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 922    dst = 0;
 923 } else {
 924    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 925 }
 926 """)
 927
 928 # Combines the first component of each input to make a 3-component vector.
 929
 930 triop_horiz("vec3", 3, 1, 1, 1, """
 931 dst.x = src0.x;
 932 dst.y = src1.x;
 933 dst.z = src2.x;
 934 """)
 935
 936 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 937                  src4_size, const_expr):
 938    opcode(name, output_size, tuint,
 939           [src1_size, src2_size, src3_size, src4_size],
 940           [tuint, tuint, tuint, tuint],
 941           False, "", const_expr)
 942
 943 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
 944        [tuint32, tuint32, tint32, tint32], False, "", """
 945 unsigned base = src0, insert = src1;
 946 int offset = src2, bits = src3;
 947 if (bits == 0) {
 948    dst = base;
 949 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
 950    dst = 0;
 951 } else {
 952    unsigned mask = ((1ull << bits) - 1) << offset;
 953    dst = (base & ~mask) | ((insert << offset) & mask);
 954 }
 955 """)
 956
 957 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
 958 dst.x = src0.x;
 959 dst.y = src1.x;
 960 dst.z = src2.x;
 961 dst.w = src3.x;
 962 """)
 963
 964 # ir3-specific instruction that maps directly to mul-add shift high mix,
 965 # (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
 966 # multiplication (imul) on Freedreno backend..
 967 opcode("imadsh_mix16", 1, tint32,
 968        [1, 1, 1], [tint32, tint32, tint32], False, "", """
 969 dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
 970 """)