src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16)
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
  93 # helper variables for strings
  94 tfloat = "float"
  95 tint = "int"
  96 tbool = "bool"
  97 tbool1 = "bool1"
  98 tbool8 = "bool8"
  99 tbool16 = "bool16"
 100 tbool32 = "bool32"
 101 tuint = "uint"
 102 tuint16 = "uint16"
 103 tfloat32 = "float32"
 104 tint32 = "int32"
 105 tuint32 = "uint32"
 106 tint64 = "int64"
 107 tuint64 = "uint64"
 108 tfloat64 = "float64"
 109
 110 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 111
 112 def type_has_size(type_):
 113     m = _TYPE_SPLIT_RE.match(type_)
 114     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 115     return m.group('bits') is not None
 116
 117 def type_size(type_):
 118     m = _TYPE_SPLIT_RE.match(type_)
 119     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 120     assert m.group('bits') is not None, \
 121            'NIR type string has no bit size: "{}"'.format(type_)
 122     return int(m.group('bits'))
 123
 124 def type_sizes(type_):
 125     if type_has_size(type_):
 126         return [type_size(type_)]
 127     elif type_ == 'bool':
 128         return [1, 8, 16, 32]
 129     elif type_ == 'float':
 130         return [16, 32, 64]
 131     else:
 132         return [1, 8, 16, 32, 64]
 133
 134 def type_base_type(type_):
 135     m = _TYPE_SPLIT_RE.match(type_)
 136     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 137     return m.group('type')
 138
 139 # Operation where the first two sources are commutative.
 140 #
 141 # For 2-source operations, this just mathematical commutativity.  Some
 142 # 3-source operations, like ffma, are only commutative in the first two
 143 # sources.
 144 _2src_commutative = "2src_commutative "
 145 associative = "associative "
 146
 147 # global dictionary of opcodes
 148 opcodes = {}
 149
 150 def opcode(name, output_size, output_type, input_sizes, input_types,
 151            is_conversion, algebraic_properties, const_expr):
 152    assert name not in opcodes
 153    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 154                           input_types, is_conversion, algebraic_properties,
 155                           const_expr)
 156
 157 def unop_convert(name, out_type, in_type, const_expr):
 158    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 159
 160 def unop(name, ty, const_expr):
 161    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 162
 163 def unop_horiz(name, output_size, output_type, input_size, input_type,
 164                const_expr):
 165    opcode(name, output_size, output_type, [input_size], [input_type],
 166           False, "", const_expr)
 167
 168 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 169                 reduce_expr, final_expr):
 170    def prereduce(src):
 171       return "(" + prereduce_expr.format(src=src) + ")"
 172    def final(src):
 173       return final_expr.format(src="(" + src + ")")
 174    def reduce_(src0, src1):
 175       return reduce_expr.format(src0=src0, src1=src1)
 176    src0 = prereduce("src0.x")
 177    src1 = prereduce("src0.y")
 178    src2 = prereduce("src0.z")
 179    src3 = prereduce("src0.w")
 180    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 181               final(reduce_(src0, src1)))
 182    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 183               final(reduce_(reduce_(src0, src1), src2)))
 184    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 185               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 186
 187 def unop_numeric_convert(name, out_type, in_type, const_expr):
 188    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 189
# Basic unary operations.
#
# Several expressions below select between a double and a float variant by
# testing bit_size.  bit_size is not defined in this file; presumably the
# generated constant-folding code supplies it -- confirm.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# -1.0, 0.0 or 1.0 depending on the sign of the source
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# clamp to the range [0.0, 1.0]
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): unlike the opcodes above, these two use the float variants
# (exp2f / log2f) without testing bit_size.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 209
# Generate all of the numeric conversion opcodes
#
# Each opcode is named <src initial>2<dst initial><dst bit size>, built from
# the first letter of the source and destination base types.  The source
# type is left unsized; the destination type always has an explicit size.
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination base types available for each source base type.
   if src_t == tbool:
      dst_types = [tfloat, tint]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          # float -> 16-bit float gets three opcodes: one per explicit
          # rounding mode suffix (_rtne, _rtz) plus the unsuffixed one.
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = "src0"

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          # float -> 32-bit float: a wider source goes through the
          # round-toward-zero helper when nir_is_rounding_mode_rtz() says so.
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          # Everything else: conversions to bool compare against zero, the
          # rest just pass the source through.
          else:
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
 267
 268
# Unary floating-point rounding operations.
#
# Each expression picks the double or float C function based on bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# fractional part: the source minus its floor
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Sources with a magnitude below 2^-14 fold to a zero carrying the sign of
# the source; everything else is round-tripped through a half float.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 279
# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
#
# The two halves of the C frexp() result are exposed as separate opcodes:
# frexp_exp stores the exponent that frexp() writes through its pointer
# argument, frexp_sig stores frexp()'s return value.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
 289
 290 # Partial derivatives.
 291
 292
 293 unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
 294 unop("fddy", tfloat, "0.0")
 295 unop("fddx_fine", tfloat, "0.0")
 296 unop("fddy_fine", tfloat, "0.0")
 297 unop("fddx_coarse", tfloat, "0.0")
 298 unop("fddy_coarse", tfloat, "0.0")
 299
 300
 301 # Floating point pack and unpack operations.
 302
 303 def pack_2x16(fmt):
 304    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 305 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 306 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 307 """.replace("fmt", fmt))
 308
 309 def pack_4x8(fmt):
 310    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 311 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 312 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 313 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 314 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 315 """.replace("fmt", fmt))
 316
 317 def unpack_2x16(fmt):
 318    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 319 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 320 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 321 """.replace("fmt", fmt))
 322
 323 def unpack_4x8(fmt):
 324    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 325 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 326 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 327 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 328 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 329 """.replace("fmt", fmt))
 330
 331
 332 pack_2x16("snorm")
 333 pack_4x8("snorm")
 334 pack_2x16("unorm")
 335 pack_4x8("unorm")
 336 pack_2x16("half")
 337 unpack_2x16("snorm")
 338 unpack_4x8("snorm")
 339 unpack_2x16("unorm")
 340 unpack_4x8("unorm")
 341 unpack_2x16("half")
 342
# Integer packing: several narrow components are combined into one wider
# scalar, with component x in the lowest bits.

# The first source is masked to 16 bits; the second is shifted up by 16.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# Components are shifted to bit offsets 0, 8, 16 and 24 and ORed together;
# no masking is applied to the individual components.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Inverse of pack_64_2x32: low 32 bits go to x, high 32 bits go to y.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
 365
 366 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 367            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 368
 369 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
 370            "dst.x = src0.x; dst.y = src0.x >> 16;")
 371
 372 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
 373 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
 374 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
 375 """)
 376
# Lowered floating point unpacking operations.
#
# Each "split" opcode produces one half of the packed source: the _x
# variants use the low bits, the _y variants shift the high bits down.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: the result is simply the low or high half of the source.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 394
# Bit operations, part of ARB_gpu_shader5.


# Mirror the 32 bits of the source: bit N moves to bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Number of set bits in the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the highest set bit, or -1 when no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Count of leading zero bits.  When the source is 0 the loop runs down to
# bit == -1, so the result is 32.
unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(31 - bit);
""")

# Signed variant of the most-significant-bit search; yields -1 when no bit
# matches (see the C comment for which bit value is searched for).
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the lowest set bit, or -1 when no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 454
 455
 456 for i in range(1, 5):
 457    for j in range(1, 5):
 458       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 459
 460
# AMD_gcn_shader extended instructions

# Two coordinates derived from a 3-component direction.  The major axis is
# the component with the largest magnitude; ma is twice that component, and
# the two remaining components (with per-face signs) are divided by ma and
# offset by 0.5.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# Face number 0..5 for a 3-component direction: +X, -X, +Y, -Y, +Z, -Z.
# The tests are not exclusive, so on ties the last matching one wins.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
#
# Declares fsum2, fsum3 and fsum4 through unop_reduce.
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 498
 499 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 500    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 501           False, alg_props, const_expr)
 502
 503 def binop(name, ty, alg_props, const_expr):
 504    binop_convert(name, ty, ty, alg_props, const_expr)
 505
 506 def binop_compare(name, ty, alg_props, const_expr):
 507    binop_convert(name, tbool1, ty, alg_props, const_expr)
 508
 509 def binop_compare8(name, ty, alg_props, const_expr):
 510    binop_convert(name, tbool8, ty, alg_props, const_expr)
 511
 512 def binop_compare16(name, ty, alg_props, const_expr):
 513    binop_convert(name, tbool16, ty, alg_props, const_expr)
 514
 515 def binop_compare32(name, ty, alg_props, const_expr):
 516    binop_convert(name, tbool32, ty, alg_props, const_expr)
 517
 518 def binop_compare_all_sizes(name, ty, alg_props, const_expr):
 519    binop_compare(name, ty, alg_props, const_expr)
 520    binop_compare8(name + "8", ty, alg_props, const_expr)
 521    binop_compare16(name + "16", ty, alg_props, const_expr)
 522    binop_compare32(name + "32", ty, alg_props, const_expr)
 523
 524 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 525                 src2_type, const_expr):
 526    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 527           False, "", const_expr)
 528
 529 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 530                  reduce_expr, final_expr):
 531    def final(src):
 532       return final_expr.format(src= "(" + src + ")")
 533    def reduce_(src0, src1):
 534       return reduce_expr.format(src0=src0, src1=src1)
 535    def prereduce(src0, src1):
 536       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 537    src0 = prereduce("src0.x", "src1.x")
 538    src1 = prereduce("src0.y", "src1.y")
 539    src2 = prereduce("src0.z", "src1.z")
 540    src3 = prereduce("src0.w", "src1.w")
 541    opcode(name + "2", output_size, output_type,
 542           [2, 2], [src_type, src_type], False, _2src_commutative,
 543           final(reduce_(src0, src1)))
 544    opcode(name + "3", output_size, output_type,
 545           [3, 3], [src_type, src_type], False, _2src_commutative,
 546           final(reduce_(reduce_(src0, src1), src2)))
 547    opcode(name + "4", output_size, output_type,
 548           [4, 4], [src_type, src_type], False, _2src_commutative,
 549           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 550
 551 def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
 552                            reduce_expr, final_expr):
 553    binop_reduce(name, output_size, tbool1, src_type,
 554                 prereduce_expr, reduce_expr, final_expr)
 555    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
 556                 prereduce_expr, reduce_expr, final_expr)
 557    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
 558                 prereduce_expr, reduce_expr, final_expr)
 559    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
 560                 prereduce_expr, reduce_expr, final_expr)
 561
# Addition and subtraction.
#
# The floating-point variants consult nir_is_rounding_mode_rtz(): in
# round-toward-zero mode, 64-bit values use a dedicated helper and narrower
# values are computed in double and then narrowed with the rtz conversion.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add: on overflow the result is the largest value
# ((1 << (bit_size - 1)) - 1) or the smallest one (1 << (bit_size - 1)).
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Saturating unsigned add: a wrapped sum is replaced by the maximum value.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract, mirroring iadd_sat.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Saturating unsigned subtract: clamps at 0 instead of wrapping.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
# Absolute difference of two signed sources, produced as an unsigned value;
# the subtraction is carried out on uint64_t casts of the sources.
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
# Absolute difference of two unsigned sources.
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
 603
# Floating-point multiply; handles round-toward-zero the same way as fadd
# and fsub (dedicated helper for 64 bits, double-then-narrow otherwise).
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
#
# For sizes below 64 bits the product is computed in int64_t and shifted
# right by bit_size.  64-bit sources are sign-extended into four 32-bit
# limbs and multiplied with ubm_mul_u32arr(); the result is assembled from
# product limbs 2 and 3.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
#
# Same structure as imul_high, but the 64-bit case only needs two 32-bit
# limbs per source because no sign extension is involved.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")
 663
# unsigned multiply of the low halves of the two sources: each source is
# masked to its low bit_size / 2 bits before the multiplication
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")

# Multiply 32-bits with low 16-bits.
binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")

binop("fdiv", tfloat, "", "src0 / src1")
# Integer division by zero folds to 0 rather than invoking C's division.
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
 677
# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
#
# The sum wrapped around exactly when it is smaller than the first source.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 700
 701 # rhadd: (a + b + 1) >> 1 (without overflow)
 702 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 703 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 704 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 705 #           =     ((x | y) << 1) - (x ^ y) + 1
 706 #
 707 # Since we know that the bottom bit of (x & y) << 1 is zero,
 708 #
 709 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 710 #                  = (x | y) -  ((x ^ y)      >> 1)
 711 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 712 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 713
# Unsigned modulus; a zero divisor folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder whenever that remainder is non-zero and
# the two sources have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# NOTE(review): both expressions use the float variants (floorf / truncf)
# without testing bit_size, unlike ffloor and ftrunc above.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 730
 731 #
 732 # Comparisons
 733 #
 734
 735
 736 # these integer-aware comparisons return a boolean (0 or ~0)
 737
 738 binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
 739 binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
 740 binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
 741 binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
 742 binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
 743 binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
 744 binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
 745 binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
 746 binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
 747 binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
 748
 749 # integer-aware GLSL-style comparisons that compare floats and ints
 750
 751 binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
 752                        "{src0} && {src1}", "{src}")
 753 binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
 754                        "{src0} || {src1}", "{src}")
 755 binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
 756                        "{src0} && {src1}", "{src}")
 757 binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
 758                        "{src0} || {src1}", "{src}")
 759
 760 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 761
 762 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
 763              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
 764 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
 765              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 766
 767 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 768 # and false respectively
 769
 770 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
 771 binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 772 binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 773 binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 774
 775 # SPIRV shifts are undefined for shift-operands >= bitsize,
 776 # but SM5 shifts are defined to use the least significant bits, only
 777 # The NIR definition is according to the SM5 specification.
 778 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
 779        "src0 << (src1 & (sizeof(src0) * 8 - 1))")
 780 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
 781        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 782 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
 783        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 784
 785 opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 786    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 787    dst = (src0 << (src1 & rotate_mask)) |
 788          (src0 >> (-src1 & rotate_mask));
 789 """)
 790 opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 791    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 792    dst = (src0 >> (src1 & rotate_mask)) |
 793          (src0 << (-src1 & rotate_mask));
 794 """)
 795
 796 # bitwise logic operators
 797 #
 798 # These are also used as boolean and, or, xor for hardware supporting
 799 # integers.
 800
 801
 802 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
 803 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
 804 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 805
 806
 807 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 808              "{src}")
 809
 810 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 811              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 812
 813 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
 814        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 815 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
 816        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 817
 818 binop("fmin", tfloat, "", "fmin(src0, src1)")
 819 binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 820 binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 821 binop("fmax", tfloat, "", "fmax(src0, src1)")
 822 binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 823 binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 824
 825 # Saturated vector add for 4 8bit ints.
 826 binop("usadd_4x8", tint32, _2src_commutative + associative, """
 827 dst = 0;
 828 for (int i = 0; i < 32; i += 8) {
 829    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 830 }
 831 """)
 832
 833 # Saturated vector subtract for 4 8bit ints.
 834 binop("ussub_4x8", tint32, "", """
 835 dst = 0;
 836 for (int i = 0; i < 32; i += 8) {
 837    int src0_chan = (src0 >> i) & 0xff;
 838    int src1_chan = (src1 >> i) & 0xff;
 839    if (src0_chan > src1_chan)
 840       dst |= (src0_chan - src1_chan) << i;
 841 }
 842 """)
 843
 844 # vector min for 4 8bit ints.
 845 binop("umin_4x8", tint32, _2src_commutative + associative, """
 846 dst = 0;
 847 for (int i = 0; i < 32; i += 8) {
 848    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 849 }
 850 """)
 851
 852 # vector max for 4 8bit ints.
 853 binop("umax_4x8", tint32, _2src_commutative + associative, """
 854 dst = 0;
 855 for (int i = 0; i < 32; i += 8) {
 856    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 857 }
 858 """)
 859
 860 # unorm multiply: (a * b) / 255.
 861 binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
 862 dst = 0;
 863 for (int i = 0; i < 32; i += 8) {
 864    int src0_chan = (src0 >> i) & 0xff;
 865    int src1_chan = (src1 >> i) & 0xff;
 866    dst |= ((src0_chan * src1_chan) / 255) << i;
 867 }
 868 """)
 869
 870 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 871
 872 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 873             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 874
 875 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 876               "src0 | ((uint64_t)src1 << 32)")
 877
 878 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
 879               "src0 | ((uint32_t)src1 << 16)")
 880
 881 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 882 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
 883 # are from the low five bits of src0 and src1, respectively.
 884 binop_convert("bfm", tuint32, tint32, "", """
 885 int bits = src0 & 0x1F;
 886 int offset = src1 & 0x1F;
 887 dst = ((1u << bits) - 1) << offset;
 888 """)
 889
 890 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
 891 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 892 /* flush denormals to zero. */
 893 if (!isnormal(dst))
 894    dst = copysignf(0.0f, src0);
 895 """)
 896
 897 # Combines the first component of each input to make a 2-component vector.
 898
 899 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 900 dst.x = src0.x;
 901 dst.y = src1.x;
 902 """)
 903
 904 # Byte extraction
 905 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 906 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 907
 908 # Word extraction
 909 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 910 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 911
 912
 913 def triop(name, ty, alg_props, const_expr):
 914    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 915 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 916    opcode(name, output_size, tuint,
 917    [src1_size, src2_size, src3_size],
 918    [tuint, tuint, tuint], False, "", const_expr)
 919
 920 triop("ffma", tfloat, _2src_commutative, """
 921 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 922    if (bit_size == 64)
 923       dst = _mesa_double_fma_rtz(src0, src1, src2);
 924    else if (bit_size == 32)
 925       dst = _mesa_float_fma_rtz(src0, src1, src2);
 926    else
 927       dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
 928 } else {
 929    if (bit_size == 32)
 930       dst = fmaf(src0, src1, src2);
 931    else
 932       dst = fma(src0, src1, src2);
 933 }
 934 """)
 935
 936 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 937
 938 # Conditional Select
 939 #
 940 # A vector conditional select instruction (like ?:, but operating per-
 941 # component on vectors). There are two versions, one for floating point
 942 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 943
 944
 945 triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 946
 947 # 3 way min/max/med
 948 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 949 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 950 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 951
 952 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 953 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 954 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 955
 956 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 957 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 958 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 959
 960 opcode("bcsel", 0, tuint, [0, 0, 0],
 961        [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
 962 opcode("b8csel", 0, tuint, [0, 0, 0],
 963        [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
 964 opcode("b16csel", 0, tuint, [0, 0, 0],
 965        [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
 966 opcode("b32csel", 0, tuint, [0, 0, 0],
 967        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 968
 969 # SM5 bfi assembly
 970 triop("bfi", tuint32, "", """
 971 unsigned mask = src0, insert = src1, base = src2;
 972 if (mask == 0) {
 973    dst = base;
 974 } else {
 975    unsigned tmp = mask;
 976    while (!(tmp & 1)) {
 977       tmp >>= 1;
 978       insert <<= 1;
 979    }
 980    dst = (base & ~mask) | (insert & mask);
 981 }
 982 """)
 983
 984
 985 triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 986
 987 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
 988 opcode("ubfe", 0, tuint32,
 989        [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
 990 unsigned base = src0;
 991 unsigned offset = src1 & 0x1F;
 992 unsigned bits = src2 & 0x1F;
 993 if (bits == 0) {
 994    dst = 0;
 995 } else if (offset + bits < 32) {
 996    dst = (base << (32 - bits - offset)) >> (32 - bits);
 997 } else {
 998    dst = base >> offset;
 999 }
1000 """)
1001 opcode("ibfe", 0, tint32,
1002        [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
1003 int base = src0;
1004 unsigned offset = src1 & 0x1F;
1005 unsigned bits = src2 & 0x1F;
1006 if (bits == 0) {
1007    dst = 0;
1008 } else if (offset + bits < 32) {
1009    dst = (base << (32 - bits - offset)) >> (32 - bits);
1010 } else {
1011    dst = base >> offset;
1012 }
1013 """)
1014
# GLSL bitfieldExtract()
#
# src0 is the base value, src1 the offset and src2 the number of bits.
# bits == 0 yields 0; a negative offset or bit count, or a field reaching
# past bit 31, also yields 0 (the result is undefined per the spec).
#
# The range check is written as "bits > 32 - offset" instead of
# "offset + bits > 32": offset is already known to be non-negative at that
# point, so the subtraction cannot overflow, whereas the sum of two large
# positive ints can.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || bits > 32 - offset) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")

# The signed variant moves the field's top bit up to bit 31 and then shifts
# right by (32 - bits), so the field ends up at bit 0 with its sign
# replicated into the upper bits.  (Shifting right by `offset` instead would
# leave the field misplaced; e.g. base = 0xF0, offset = 4, bits = 4 must
# give -1.)
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || bits > 32 - offset) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1040
# vec3: builds a 3-component vector out of the first component of each
# input.  The expression assigns srcN.x to the N-th of dst.x, dst.y, dst.z.

_vec3_expr = "\n" + "".join("dst.%s = src%d.x;\n" % (_chan, _idx)
                            for _idx, _chan in enumerate("xyz"))
triop_horiz("vec3", 3, 1, 1, 1, _vec3_expr)
1048
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit vector sizes.

   The output and all four sources are typed tuint; the sizes are passed
   through to opcode() unchanged.  The opcode gets no algebraic properties
   and is_conversion is False.
   """
   input_sizes = [src1_size, src2_size, src3_size, src4_size]
   input_types = [tuint] * len(input_sizes)
   opcode(name, output_size, tuint, input_sizes, input_types, False, "",
          const_expr)
1055
# Bitfield insert: src0 is the base, src1 the value to insert, src2 the
# offset and src3 the number of bits.  bits == 0 returns the base unchanged;
# a negative offset or bit count, or a field reaching past bit 31, returns 0.
#
# The range check is written as "bits > 32 - offset" instead of
# "bits + offset > 32": offset is already known to be non-negative at that
# point, so the subtraction cannot overflow, whereas the sum of two large
# positive ints can.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits > 32 - offset) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1069
def _vec_const_expr(channels):
   """Return the expression that copies srcN.x into the N-th named channel.

   ``channels`` is a string of destination component letters; one
   "dst.<letter> = src<N>.x;" line is produced per letter, in order.
   """
   return "\n" + "".join("dst.%s = src%d.x;\n" % (chan, idx)
                         for idx, chan in enumerate(channels))

# vec4 / vec8 / vec16: build a vector out of the first component of each
# input.
quadop_horiz("vec4", 4, 1, 1, 1, 1, _vec_const_expr("xyzw"))

for _vec_name, _vec_channels in (
      ("vec8", "xyzwefgh"),
      ("vec16", "xyzwefghijklmnop"),
   ):
   _vec_size = len(_vec_channels)
   opcode(_vec_name, _vec_size, tuint,
          [1] * _vec_size, [tuint] * _vec_size,
          False, "", _vec_const_expr(_vec_channels))
1110
# amul: an integer multiply meant for address calculation.  It behaves like
# imul, except that the result is undefined on overflow, where overflow is
# judged against the size of the variable being dereferenced.
#
# That looser contract lets an optimization pass propagate bounds (i.e.,
# from a load/store intrinsic) back to the sources, so that lower-precision
# integer multiplies can be used.  This helps on hardware that has 24-bit
# or perhaps 16-bit integer multiply instructions.
_amul_props = _2src_commutative + associative
binop("amul", tint, _amul_props, "src0 * src1")
1122
# ir3-specific instruction that maps directly to mul-add shift high mix
# (IMADSH_MIX16, i.e. ah * bl << 16 + c).  It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
#
# The expression multiplies the high 16 bits of src0 by the low 16 bits of
# src1, shifts the product left by 16 and adds src2.
_imadsh_mix16_expr = """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
"""
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "", _imadsh_mix16_expr)
1130
# 24-bit multiplies.  Each operand is sign-extended from 24 bits by shifting
# it left 8 and arithmetically back right 8 before multiplying.
_sext24 = "(((int32_t)%s << 8) >> 8)"
_mul24_expr = "%s * %s" % (_sext24 % "src0", _sext24 % "src1")

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
triop("imad24_ir3", tint32, _2src_commutative, _mul24_expr + " + src2")

# 24b multiply into 32b result (with sign extension)
binop("imul24", tint32, _2src_commutative + associative, _mul24_expr)