src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16)
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
  93 # helper variables for strings
  94 tfloat = "float"
  95 tint = "int"
  96 tbool = "bool"
  97 tbool1 = "bool1"
  98 tbool8 = "bool8"
  99 tbool16 = "bool16"
 100 tbool32 = "bool32"
 101 tuint = "uint"
 102 tuint16 = "uint16"
 103 tfloat16 = "float16"
 104 tfloat32 = "float32"
 105 tint32 = "int32"
 106 tuint32 = "uint32"
 107 tint64 = "int64"
 108 tuint64 = "uint64"
 109 tfloat64 = "float64"
 110
 111 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 112
 113 def type_has_size(type_):
 114     m = _TYPE_SPLIT_RE.match(type_)
 115     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 116     return m.group('bits') is not None
 117
 118 def type_size(type_):
 119     m = _TYPE_SPLIT_RE.match(type_)
 120     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 121     assert m.group('bits') is not None, \
 122            'NIR type string has no bit size: "{}"'.format(type_)
 123     return int(m.group('bits'))
 124
 125 def type_sizes(type_):
 126     if type_has_size(type_):
 127         return [type_size(type_)]
 128     elif type_ == 'bool':
 129         return [1, 8, 16, 32]
 130     elif type_ == 'float':
 131         return [16, 32, 64]
 132     else:
 133         return [1, 8, 16, 32, 64]
 134
 135 def type_base_type(type_):
 136     m = _TYPE_SPLIT_RE.match(type_)
 137     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 138     return m.group('type')
 139
 140 # Operation where the first two sources are commutative.
 141 #
 142 # For 2-source operations, this just mathematical commutativity.  Some
 143 # 3-source operations, like ffma, are only commutative in the first two
 144 # sources.
 145 _2src_commutative = "2src_commutative "
 146 associative = "associative "
 147
 148 # global dictionary of opcodes
 149 opcodes = {}
 150
 151 def opcode(name, output_size, output_type, input_sizes, input_types,
 152            is_conversion, algebraic_properties, const_expr):
 153    assert name not in opcodes
 154    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 155                           input_types, is_conversion, algebraic_properties,
 156                           const_expr)
 157
 158 def unop_convert(name, out_type, in_type, const_expr):
 159    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 160
 161 def unop(name, ty, const_expr):
 162    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 163
 164 def unop_horiz(name, output_size, output_type, input_size, input_type,
 165                const_expr):
 166    opcode(name, output_size, output_type, [input_size], [input_type],
 167           False, "", const_expr)
 168
 169 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 170                 reduce_expr, final_expr):
 171    def prereduce(src):
 172       return "(" + prereduce_expr.format(src=src) + ")"
 173    def final(src):
 174       return final_expr.format(src="(" + src + ")")
 175    def reduce_(src0, src1):
 176       return reduce_expr.format(src0=src0, src1=src1)
 177    src0 = prereduce("src0.x")
 178    src1 = prereduce("src0.y")
 179    src2 = prereduce("src0.z")
 180    src3 = prereduce("src0.w")
 181    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 182               final(reduce_(src0, src1)))
 183    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 184               final(reduce_(reduce_(src0, src1), src2)))
 185    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 186               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 187
 188 def unop_numeric_convert(name, out_type, in_type, const_expr):
 189    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 190
 191 unop("mov", tuint, "src0")
 192
 193 unop("ineg", tint, "-src0")
 194 unop("fneg", tfloat, "-src0")
 195 unop("inot", tint, "~src0") # invert every bit of the integer
 196 unop("fsign", tfloat, ("bit_size == 64 ? " +
 197                        "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
 198                        "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
 199 unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
 200 unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
 201 unop("fabs", tfloat, "fabs(src0)")
 202 unop("fsat", tfloat, ("bit_size == 64 ? " +
 203                       "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
 204                       "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
 205 unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
 206 unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
 207 unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
 208 unop("fexp2", tfloat, "exp2f(src0)")
 209 unop("flog2", tfloat, "log2f(src0)")
 210
 211 # Generate all of the numeric conversion opcodes
 212 for src_t in [tint, tuint, tfloat, tbool]:
 213    if src_t == tbool:
 214       dst_types = [tfloat, tint, tbool]
 215    elif src_t == tint:
 216       dst_types = [tfloat, tint, tbool]
 217    elif src_t == tuint:
 218       dst_types = [tfloat, tuint]
 219    elif src_t == tfloat:
 220       dst_types = [tint, tuint, tfloat, tbool]
 221
 222    for dst_t in dst_types:
 223       for dst_bit_size in type_sizes(dst_t):
 224           if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
 225               rnd_modes = ['_rtne', '_rtz', '']
 226               for rnd_mode in rnd_modes:
 227                   if rnd_mode == '_rtne':
 228                       conv_expr = """
 229                       if (bit_size > 16) {
 230                          dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
 231                       } else {
 232                          dst = src0;
 233                       }
 234                       """
 235                   elif rnd_mode == '_rtz':
 236                       conv_expr = """
 237                       if (bit_size > 16) {
 238                          dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
 239                       } else {
 240                          dst = src0;
 241                       }
 242                       """
 243                   else:
 244                       conv_expr = "src0"
 245
 246                   unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
 247                                                               dst_t[0],
 248                                                               dst_bit_size,
 249                                                               rnd_mode),
 250                                        dst_t + str(dst_bit_size),
 251                                        src_t, conv_expr)
 252           elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
 253               conv_expr = """
 254               if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
 255                  dst = _mesa_double_to_float_rtz(src0);
 256               } else {
 257                  dst = src0;
 258               }
 259               """
 260               unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
 261                                                        dst_bit_size),
 262                                    dst_t + str(dst_bit_size), src_t, conv_expr)
 263           else:
 264               conv_expr = "src0 != 0" if dst_t == tbool else "src0"
 265               unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
 266                                                        dst_bit_size),
 267                                    dst_t + str(dst_bit_size), src_t, conv_expr)
 268
 269 # Special opcode that is the same as f2f16 except that it is safe to remove it
 270 # if the result is immediately converted back to float32 again. This is
 271 # generated as part of the precision lowering pass. mp stands for medium
 272 # precision.
 273 unop_numeric_convert("f2fmp", tfloat16, tfloat, opcodes["f2f16"].const_expr)
 274
 275 # Unary floating-point rounding operations.
 276
 277
 278 unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
 279 unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
 280 unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
 281 unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
 282 unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
 283
 284 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 285
 286 # Trigonometric operations.
 287
 288
 289 unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
 290 unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
 291
 292 # dfrexp
 293 unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
 294 unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
 295
 296 # Partial derivatives.
 297
 298
 299 unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
 300 unop("fddy", tfloat, "0.0")
 301 unop("fddx_fine", tfloat, "0.0")
 302 unop("fddy_fine", tfloat, "0.0")
 303 unop("fddx_coarse", tfloat, "0.0")
 304 unop("fddy_coarse", tfloat, "0.0")
 305
 306
 307 # Floating point pack and unpack operations.
 308
 309 def pack_2x16(fmt):
 310    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 311 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 312 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 313 """.replace("fmt", fmt))
 314
 315 def pack_4x8(fmt):
 316    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 317 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 318 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 319 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 320 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 321 """.replace("fmt", fmt))
 322
 323 def unpack_2x16(fmt):
 324    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 325 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 326 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 327 """.replace("fmt", fmt))
 328
 329 def unpack_4x8(fmt):
 330    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 331 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 332 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 333 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 334 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 335 """.replace("fmt", fmt))
 336
 337
 338 pack_2x16("snorm")
 339 pack_4x8("snorm")
 340 pack_2x16("unorm")
 341 pack_4x8("unorm")
 342 pack_2x16("half")
 343 unpack_2x16("snorm")
 344 unpack_4x8("snorm")
 345 unpack_2x16("unorm")
 346 unpack_4x8("unorm")
 347 unpack_2x16("half")
 348
 349 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 350 dst.x = (src0.x & 0xffff) | (src0.y << 16);
 351 """)
 352
 353 unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
 354 dst.x = (src0.x <<  0) |
 355         (src0.y <<  8) |
 356         (src0.z << 16) |
 357         (src0.w << 24);
 358 """)
 359
 360 unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
 361            "dst.x = src0.x | ((uint32_t)src0.y << 16);")
 362
 363 unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
 364            "dst.x = src0.x | ((uint64_t)src0.y << 32);")
 365
 366 unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
 367            "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
 368
 369 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
 370            "dst.x = src0.x; dst.y = src0.x >> 32;")
 371
 372 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 373            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 374
 375 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
 376            "dst.x = src0.x; dst.y = src0.x >> 16;")
 377
 378 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
 379 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
 380 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
 381 """)
 382
 383 # Lowered floating point unpacking operations.
 384
 385 unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
 386              "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
 387 unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
 388              "unpack_half_1x16((uint16_t)(src0 >> 16))")
 389
 390 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
 391              "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
 392 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
 393              "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
 394
 395 unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
 396 unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")
 397
 398 unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
 399 unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 400
 401 # Bit operations, part of ARB_gpu_shader5.
 402
 403
 404 unop("bitfield_reverse", tuint32, """
 405 /* we're not winning any awards for speed here, but that's ok */
 406 dst = 0;
 407 for (unsigned bit = 0; bit < 32; bit++)
 408    dst |= ((src0 >> bit) & 1) << (31 - bit);
 409 """)
 410 unop_convert("bit_count", tuint32, tuint, """
 411 dst = 0;
 412 for (unsigned bit = 0; bit < bit_size; bit++) {
 413    if ((src0 >> bit) & 1)
 414       dst++;
 415 }
 416 """)
 417
 418 unop_convert("ufind_msb", tint32, tuint, """
 419 dst = -1;
 420 for (int bit = bit_size - 1; bit >= 0; bit--) {
 421    if ((src0 >> bit) & 1) {
 422       dst = bit;
 423       break;
 424    }
 425 }
 426 """)
 427
 428 unop("uclz", tuint32, """
 429 int bit;
 430 for (bit = bit_size - 1; bit >= 0; bit--) {
 431    if ((src0 & (1u << bit)) != 0)
 432       break;
 433 }
 434 dst = (unsigned)(31 - bit);
 435 """)
 436
 437 unop("ifind_msb", tint32, """
 438 dst = -1;
 439 for (int bit = 31; bit >= 0; bit--) {
 440    /* If src0 < 0, we're looking for the first 0 bit.
 441     * if src0 >= 0, we're looking for the first 1 bit.
 442     */
 443    if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
 444       (!((src0 >> bit) & 1) && (src0 < 0))) {
 445       dst = bit;
 446       break;
 447    }
 448 }
 449 """)
 450
 451 unop_convert("find_lsb", tint32, tint, """
 452 dst = -1;
 453 for (unsigned bit = 0; bit < bit_size; bit++) {
 454    if ((src0 >> bit) & 1) {
 455       dst = bit;
 456       break;
 457    }
 458 }
 459 """)
 460
 461
 462 for i in range(1, 5):
 463    for j in range(1, 5):
 464       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 465
 466
 467 # AMD_gcn_shader extended instructions
 468 unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
 469 dst.x = dst.y = 0.0;
 470 float absX = fabsf(src0.x);
 471 float absY = fabsf(src0.y);
 472 float absZ = fabsf(src0.z);
 473
 474 float ma = 0.0;
 475 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
 476 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
 477 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
 478
 479 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
 480 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
 481 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
 482 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
 483 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
 484 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
 485
 486 dst.x = dst.x / ma + 0.5;
 487 dst.y = dst.y / ma + 0.5;
 488 """)
 489
 490 unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
 491 float absX = fabsf(src0.x);
 492 float absY = fabsf(src0.y);
 493 float absZ = fabsf(src0.z);
 494 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
 495 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
 496 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
 497 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
 498 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
 499 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
 500 """)
 501
 502 # Sum of vector components
 503 unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 504
 505 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 506    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 507           False, alg_props, const_expr)
 508
 509 def binop(name, ty, alg_props, const_expr):
 510    binop_convert(name, ty, ty, alg_props, const_expr)
 511
 512 def binop_compare(name, ty, alg_props, const_expr):
 513    binop_convert(name, tbool1, ty, alg_props, const_expr)
 514
 515 def binop_compare8(name, ty, alg_props, const_expr):
 516    binop_convert(name, tbool8, ty, alg_props, const_expr)
 517
 518 def binop_compare16(name, ty, alg_props, const_expr):
 519    binop_convert(name, tbool16, ty, alg_props, const_expr)
 520
 521 def binop_compare32(name, ty, alg_props, const_expr):
 522    binop_convert(name, tbool32, ty, alg_props, const_expr)
 523
 524 def binop_compare_all_sizes(name, ty, alg_props, const_expr):
 525    binop_compare(name, ty, alg_props, const_expr)
 526    binop_compare8(name + "8", ty, alg_props, const_expr)
 527    binop_compare16(name + "16", ty, alg_props, const_expr)
 528    binop_compare32(name + "32", ty, alg_props, const_expr)
 529
 530 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 531                 src2_type, const_expr):
 532    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 533           False, "", const_expr)
 534
 535 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 536                  reduce_expr, final_expr):
 537    def final(src):
 538       return final_expr.format(src= "(" + src + ")")
 539    def reduce_(src0, src1):
 540       return reduce_expr.format(src0=src0, src1=src1)
 541    def prereduce(src0, src1):
 542       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 543    src0 = prereduce("src0.x", "src1.x")
 544    src1 = prereduce("src0.y", "src1.y")
 545    src2 = prereduce("src0.z", "src1.z")
 546    src3 = prereduce("src0.w", "src1.w")
 547    opcode(name + "2", output_size, output_type,
 548           [2, 2], [src_type, src_type], False, _2src_commutative,
 549           final(reduce_(src0, src1)))
 550    opcode(name + "3", output_size, output_type,
 551           [3, 3], [src_type, src_type], False, _2src_commutative,
 552           final(reduce_(reduce_(src0, src1), src2)))
 553    opcode(name + "4", output_size, output_type,
 554           [4, 4], [src_type, src_type], False, _2src_commutative,
 555           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 556
 557 def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
 558                            reduce_expr, final_expr):
 559    binop_reduce(name, output_size, tbool1, src_type,
 560                 prereduce_expr, reduce_expr, final_expr)
 561    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
 562                 prereduce_expr, reduce_expr, final_expr)
 563    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
 564                 prereduce_expr, reduce_expr, final_expr)
 565    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
 566                 prereduce_expr, reduce_expr, final_expr)
 567
 568 binop("fadd", tfloat, _2src_commutative + associative,"""
 569 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 570    if (bit_size == 64)
 571       dst = _mesa_double_add_rtz(src0, src1);
 572    else
 573       dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
 574 } else {
 575    dst = src0 + src1;
 576 }
 577 """)
 578 binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
 579 binop("iadd_sat", tint, _2src_commutative, """
 580       src1 > 0 ?
 581          (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
 582          (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
 583 """)
 584 binop("uadd_sat", tuint, _2src_commutative,
 585       "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
 586 binop("isub_sat", tint, "", """
 587       src1 < 0 ?
 588          (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
 589          (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
 590 """)
 591 binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")
 592
 593 binop("fsub", tfloat, "", """
 594 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 595    if (bit_size == 64)
 596       dst = _mesa_double_sub_rtz(src0, src1);
 597    else
 598       dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
 599 } else {
 600    dst = src0 - src1;
 601 }
 602 """)
 603 binop("isub", tint, "", "src0 - src1")
 604 binop_convert("uabs_isub", tuint, tint, "", """
 605               src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
 606                           : (uint64_t) src0 - (uint64_t) src1
 607 """)
 608 binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
 609
 610 binop("fmul", tfloat, _2src_commutative + associative, """
 611 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 612    if (bit_size == 64)
 613       dst = _mesa_double_mul_rtz(src0, src1);
 614    else
 615       dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
 616 } else {
 617    dst = src0 * src1;
 618 }
 619 """)
 620 # low 32-bits of signed/unsigned integer multiply
 621 binop("imul", tint, _2src_commutative + associative, "src0 * src1")
 622
 623 # Generate 64 bit result from 2 32 bits quantity
 624 binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
 625               "(int64_t)src0 * (int64_t)src1")
 626 binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
 627               "(uint64_t)src0 * (uint64_t)src1")
 628
 629 # high 32-bits of signed integer multiply
 630 binop("imul_high", tint, _2src_commutative, """
 631 if (bit_size == 64) {
 632    /* We need to do a full 128-bit x 128-bit multiply in order for the sign
 633     * extension to work properly.  The casts are kind-of annoying but needed
 634     * to prevent compiler warnings.
 635     */
 636    uint32_t src0_u32[4] = {
 637       src0,
 638       (int64_t)src0 >> 32,
 639       (int64_t)src0 >> 63,
 640       (int64_t)src0 >> 63,
 641    };
 642    uint32_t src1_u32[4] = {
 643       src1,
 644       (int64_t)src1 >> 32,
 645       (int64_t)src1 >> 63,
 646       (int64_t)src1 >> 63,
 647    };
 648    uint32_t prod_u32[4];
 649    ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
 650    dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
 651 } else {
 652    dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
 653 }
 654 """)
 655
 656 # high 32-bits of unsigned integer multiply
 657 binop("umul_high", tuint, _2src_commutative, """
 658 if (bit_size == 64) {
 659    /* The casts are kind-of annoying but needed to prevent compiler warnings. */
 660    uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
 661    uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
 662    uint32_t prod_u32[4];
 663    ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
 664    dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
 665 } else {
 666    dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
 667 }
 668 """)
 669
 670 # low 32-bits of unsigned integer multiply
 671 binop("umul_low", tuint32, _2src_commutative, """
 672 uint64_t mask = (1 << (bit_size / 2)) - 1;
 673 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
 674 """)
 675
 676 # Multiply 32-bits with low 16-bits.
 677 binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
 678 binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")
 679
 680 binop("fdiv", tfloat, "", "src0 / src1")
 681 binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
 682 binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
 683
 684 # returns a boolean representing the carry resulting from the addition of
 685 # the two unsigned arguments.
 686
 687 binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")
 688
 689 # returns a boolean representing the borrow resulting from the subtraction
 690 # of the two unsigned arguments.
 691
 692 binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 693
 694 # hadd: (a + b) >> 1 (without overflow)
 695 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
 696 #       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
 697 #       = 2 *  (x & y) + (x & ~y) +                (~x & y)
 698 #       =     ((x & y) << 1) + (x ^ y)
 699 #
 700 # Since we know that the bottom bit of (x & y) << 1 is zero,
 701 #
 702 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
 703 #              =   (x & y) +      ((x ^ y)  >> 1)
 704 binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 705 binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 706
 707 # rhadd: (a + b + 1) >> 1 (without overflow)
 708 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 709 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 710 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 711 #           =     ((x | y) << 1) - (x ^ y) + 1
 712 #
 713 # Since we know that the bottom bit of (x & y) << 1 is zero,
 714 #
 715 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 716 #                  = (x | y) -  ((x ^ y)      >> 1)
 717 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 718 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 719
 720 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
 721
 722 # For signed integers, there are several different possible definitions of
 723 # "modulus" or "remainder".  We follow the conventions used by LLVM and
 724 # SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
 725 # operation while the imod opcode implements the more mathematical
 726 # "modulus" operation.  For details on the difference, see
 727 #
 728 # http://mathforum.org/library/drmath/view/52343.html
 729
 730 binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
 731 binop("imod", tint, "",
 732       "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
 733       "                 src0 % src1 : src0 % src1 + src1)")
 734 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 735 binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 736
 737 #
 738 # Comparisons
 739 #
 740
 741
 742 # these integer-aware comparisons return a boolean (0 or ~0)
 743
 744 binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
 745 binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
 746 binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
 747 binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
 748 binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
 749 binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
 750 binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
 751 binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
 752 binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
 753 binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
 754
 755 # integer-aware GLSL-style comparisons that compare floats and ints
 756
 757 binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
 758                        "{src0} && {src1}", "{src}")
 759 binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
 760                        "{src0} || {src1}", "{src}")
 761 binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
 762                        "{src0} && {src1}", "{src}")
 763 binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
 764                        "{src0} || {src1}", "{src}")
 765
 766 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 767
 768 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
 769              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
 770 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
 771              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 772
 773 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 774 # and false respectively
 775
 776 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
 777 binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 778 binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 779 binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 780
 781 # SPIRV shifts are undefined for shift-operands >= bitsize,
 782 # but SM5 shifts are defined to use the least significant bits, only
 783 # The NIR definition is according to the SM5 specification.
 784 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
 785        "src0 << (src1 & (sizeof(src0) * 8 - 1))")
 786 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
 787        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 788 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
 789        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 790
 791 opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 792    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 793    dst = (src0 << (src1 & rotate_mask)) |
 794          (src0 >> (-src1 & rotate_mask));
 795 """)
 796 opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 797    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 798    dst = (src0 >> (src1 & rotate_mask)) |
 799          (src0 << (-src1 & rotate_mask));
 800 """)
 801
 802 # bitwise logic operators
 803 #
 804 # These are also used as boolean and, or, xor for hardware supporting
 805 # integers.
 806
 807
 808 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
 809 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
 810 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 811
 812
 813 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 814              "{src}")
 815
 816 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 817              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 818
 819 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
 820        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 821 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
 822        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 823
 824 binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
 825 binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 826 binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 827 binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
 828 binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 829 binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 830
 831 # Saturated vector add for 4 8bit ints.
 832 binop("usadd_4x8", tint32, _2src_commutative + associative, """
 833 dst = 0;
 834 for (int i = 0; i < 32; i += 8) {
 835    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 836 }
 837 """)
 838
 839 # Saturated vector subtract for 4 8bit ints.
 840 binop("ussub_4x8", tint32, "", """
 841 dst = 0;
 842 for (int i = 0; i < 32; i += 8) {
 843    int src0_chan = (src0 >> i) & 0xff;
 844    int src1_chan = (src1 >> i) & 0xff;
 845    if (src0_chan > src1_chan)
 846       dst |= (src0_chan - src1_chan) << i;
 847 }
 848 """)
 849
 850 # vector min for 4 8bit ints.
 851 binop("umin_4x8", tint32, _2src_commutative + associative, """
 852 dst = 0;
 853 for (int i = 0; i < 32; i += 8) {
 854    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 855 }
 856 """)
 857
 858 # vector max for 4 8bit ints.
 859 binop("umax_4x8", tint32, _2src_commutative + associative, """
 860 dst = 0;
 861 for (int i = 0; i < 32; i += 8) {
 862    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 863 }
 864 """)
 865
 866 # unorm multiply: (a * b) / 255.
 867 binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
 868 dst = 0;
 869 for (int i = 0; i < 32; i += 8) {
 870    int src0_chan = (src0 >> i) & 0xff;
 871    int src1_chan = (src1 >> i) & 0xff;
 872    dst |= ((src0_chan * src1_chan) / 255) << i;
 873 }
 874 """)
 875
 876 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 877
 878 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 879             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 880
 881 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 882               "src0 | ((uint64_t)src1 << 32)")
 883
 884 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
 885               "src0 | ((uint32_t)src1 << 16)")
 886
 887 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 888 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
 889 # are from the low five bits of src0 and src1, respectively.
 890 binop_convert("bfm", tuint32, tint32, "", """
 891 int bits = src0 & 0x1F;
 892 int offset = src1 & 0x1F;
 893 dst = ((1u << bits) - 1) << offset;
 894 """)
 895
 896 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
 897 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 898 /* flush denormals to zero. */
 899 if (!isnormal(dst))
 900    dst = copysignf(0.0f, src0);
 901 """)
 902
 903 # Combines the first component of each input to make a 2-component vector.
 904
 905 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 906 dst.x = src0.x;
 907 dst.y = src1.x;
 908 """)
 909
 910 # Byte extraction
 911 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 912 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 913
 914 # Word extraction
 915 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 916 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 917
 918
 919 def triop(name, ty, alg_props, const_expr):
 920    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 921 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 922    opcode(name, output_size, tuint,
 923    [src1_size, src2_size, src3_size],
 924    [tuint, tuint, tuint], False, "", const_expr)
 925
 926 triop("ffma", tfloat, _2src_commutative, """
 927 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 928    if (bit_size == 64)
 929       dst = _mesa_double_fma_rtz(src0, src1, src2);
 930    else if (bit_size == 32)
 931       dst = _mesa_float_fma_rtz(src0, src1, src2);
 932    else
 933       dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
 934 } else {
 935    if (bit_size == 32)
 936       dst = fmaf(src0, src1, src2);
 937    else
 938       dst = fma(src0, src1, src2);
 939 }
 940 """)
 941
 942 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 943
 944 # Conditional Select
 945 #
 946 # A vector conditional select instruction (like ?:, but operating per-
 947 # component on vectors). There are two versions, one for floating point
 948 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 949
 950
 951 triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 952
 953 # 3 way min/max/med
 954 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 955 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 956 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 957
 958 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 959 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 960 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 961
 962 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 963 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 964 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 965
 966 opcode("bcsel", 0, tuint, [0, 0, 0],
 967        [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
 968 opcode("b8csel", 0, tuint, [0, 0, 0],
 969        [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
 970 opcode("b16csel", 0, tuint, [0, 0, 0],
 971        [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
 972 opcode("b32csel", 0, tuint, [0, 0, 0],
 973        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 974
 975 # SM5 bfi assembly
 976 triop("bfi", tuint32, "", """
 977 unsigned mask = src0, insert = src1, base = src2;
 978 if (mask == 0) {
 979    dst = base;
 980 } else {
 981    unsigned tmp = mask;
 982    while (!(tmp & 1)) {
 983       tmp >>= 1;
 984       insert <<= 1;
 985    }
 986    dst = (base & ~mask) | (insert & mask);
 987 }
 988 """)
 989
 990
 991 triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 992
 993 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
 994 opcode("ubfe", 0, tuint32,
 995        [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
 996 unsigned base = src0;
 997 unsigned offset = src1 & 0x1F;
 998 unsigned bits = src2 & 0x1F;
 999 if (bits == 0) {
1000    dst = 0;
1001 } else if (offset + bits < 32) {
1002    dst = (base << (32 - bits - offset)) >> (32 - bits);
1003 } else {
1004    dst = base >> offset;
1005 }
1006 """)
1007 opcode("ibfe", 0, tint32,
1008        [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
1009 int base = src0;
1010 unsigned offset = src1 & 0x1F;
1011 unsigned bits = src2 & 0x1F;
1012 if (bits == 0) {
1013    dst = 0;
1014 } else if (offset + bits < 32) {
1015    dst = (base << (32 - bits - offset)) >> (32 - bits);
1016 } else {
1017    dst = base >> offset;
1018 }
1019 """)
1020
1021 # GLSL bitfieldExtract()
1022 opcode("ubitfield_extract", 0, tuint32,
1023        [0, 0, 0], [tuint32, tint32, tint32], False, "", """
1024 unsigned base = src0;
1025 int offset = src1, bits = src2;
1026 if (bits == 0) {
1027    dst = 0;
1028 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
1029    dst = 0; /* undefined per the spec */
1030 } else {
1031    dst = (base >> offset) & ((1ull << bits) - 1);
1032 }
1033 """)
1034 opcode("ibitfield_extract", 0, tint32,
1035        [0, 0, 0], [tint32, tint32, tint32], False, "", """
1036 int base = src0;
1037 int offset = src1, bits = src2;
1038 if (bits == 0) {
1039    dst = 0;
1040 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
1041    dst = 0;
1042 } else {
1043    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
1044 }
1045 """)
1046
1047 # Combines the first component of each input to make a 3-component vector.
1048
1049 triop_horiz("vec3", 3, 1, 1, 1, """
1050 dst.x = src0.x;
1051 dst.y = src1.x;
1052 dst.z = src2.x;
1053 """)
1054
1055 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
1056                  src4_size, const_expr):
1057    opcode(name, output_size, tuint,
1058           [src1_size, src2_size, src3_size, src4_size],
1059           [tuint, tuint, tuint, tuint],
1060           False, "", const_expr)
1061
1062 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
1063        [tuint32, tuint32, tint32, tint32], False, "", """
1064 unsigned base = src0, insert = src1;
1065 int offset = src2, bits = src3;
1066 if (bits == 0) {
1067    dst = base;
1068 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
1069    dst = 0;
1070 } else {
1071    unsigned mask = ((1ull << bits) - 1) << offset;
1072    dst = (base & ~mask) | ((insert << offset) & mask);
1073 }
1074 """)
1075
1076 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
1077 dst.x = src0.x;
1078 dst.y = src1.x;
1079 dst.z = src2.x;
1080 dst.w = src3.x;
1081 """)
1082
1083 opcode("vec8", 8, tuint,
1084        [1] * 8, [tuint] * 8,
1085        False, "", """
1086 dst.x = src0.x;
1087 dst.y = src1.x;
1088 dst.z = src2.x;
1089 dst.w = src3.x;
1090 dst.e = src4.x;
1091 dst.f = src5.x;
1092 dst.g = src6.x;
1093 dst.h = src7.x;
1094 """)
1095
1096 opcode("vec16", 16, tuint,
1097        [1] * 16, [tuint] * 16,
1098        False, "", """
1099 dst.x = src0.x;
1100 dst.y = src1.x;
1101 dst.z = src2.x;
1102 dst.w = src3.x;
1103 dst.e = src4.x;
1104 dst.f = src5.x;
1105 dst.g = src6.x;
1106 dst.h = src7.x;
1107 dst.i = src8.x;
1108 dst.j = src9.x;
1109 dst.k = src10.x;
1110 dst.l = src11.x;
1111 dst.m = src12.x;
1112 dst.n = src13.x;
1113 dst.o = src14.x;
1114 dst.p = src15.x;
1115 """)
1116
1117 # An integer multiply instruction for address calculation.  This is
1118 # similar to imul, except that the results are undefined in case of
1119 # overflow.  Overflow is defined according to the size of the variable
1120 # being dereferenced.
1121 #
1122 # This relaxed definition, compared to imul, allows an optimization
1123 # pass to propagate bounds (ie, from an load/store intrinsic) to the
1124 # sources, such that lower precision integer multiplies can be used.
1125 # This is useful on hw that has 24b or perhaps 16b integer multiply
1126 # instructions.
1127 binop("amul", tint, _2src_commutative + associative, "src0 * src1")
1128
1129 # ir3-specific instruction that maps directly to mul-add shift high mix,
1130 # (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
1131 # multiplication (imul) on Freedreno backend..
1132 opcode("imadsh_mix16", 1, tint32,
1133        [1, 1, 1], [tint32, tint32, tint32], False, "", """
1134 dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
1135 """)
1136
1137 # ir3-specific instruction that maps directly to ir3 mad.s24.
1138 #
1139 # 24b multiply into 32b result (with sign extension) plus 32b int
1140 triop("imad24_ir3", tint32, _2src_commutative,
1141       "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")
1142
1143 # 24b multiply into 32b result (with sign extension)
1144 binop("imul24", tint32, _2src_commutative + associative,
1145       "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")