src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16)
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
  93 # helper variables for strings
  94 tfloat = "float"
  95 tint = "int"
  96 tbool = "bool"
  97 tbool1 = "bool1"
  98 tbool8 = "bool8"
  99 tbool16 = "bool16"
 100 tbool32 = "bool32"
 101 tuint = "uint"
 102 tuint8 = "uint8"
 103 tuint16 = "uint16"
 104 tfloat16 = "float16"
 105 tfloat32 = "float32"
 106 tint32 = "int32"
 107 tuint32 = "uint32"
 108 tint64 = "int64"
 109 tuint64 = "uint64"
 110 tfloat64 = "float64"
 111
 112 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 113
 114 def type_has_size(type_):
 115     m = _TYPE_SPLIT_RE.match(type_)
 116     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 117     return m.group('bits') is not None
 118
 119 def type_size(type_):
 120     m = _TYPE_SPLIT_RE.match(type_)
 121     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 122     assert m.group('bits') is not None, \
 123            'NIR type string has no bit size: "{}"'.format(type_)
 124     return int(m.group('bits'))
 125
 126 def type_sizes(type_):
 127     if type_has_size(type_):
 128         return [type_size(type_)]
 129     elif type_ == 'bool':
 130         return [1, 8, 16, 32]
 131     elif type_ == 'float':
 132         return [16, 32, 64]
 133     else:
 134         return [1, 8, 16, 32, 64]
 135
 136 def type_base_type(type_):
 137     m = _TYPE_SPLIT_RE.match(type_)
 138     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 139     return m.group('type')
 140
 141 # Operation where the first two sources are commutative.
 142 #
# For 2-source operations, this is just mathematical commutativity.  Some
 144 # 3-source operations, like ffma, are only commutative in the first two
 145 # sources.
 146 _2src_commutative = "2src_commutative "
 147 associative = "associative "
 148
 149 # global dictionary of opcodes
 150 opcodes = {}
 151
 152 def opcode(name, output_size, output_type, input_sizes, input_types,
 153            is_conversion, algebraic_properties, const_expr):
 154    assert name not in opcodes
 155    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 156                           input_types, is_conversion, algebraic_properties,
 157                           const_expr)
 158
 159 def unop_convert(name, out_type, in_type, const_expr):
 160    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 161
 162 def unop(name, ty, const_expr):
 163    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 164
 165 def unop_horiz(name, output_size, output_type, input_size, input_type,
 166                const_expr):
 167    opcode(name, output_size, output_type, [input_size], [input_type],
 168           False, "", const_expr)
 169
 170 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 171                 reduce_expr, final_expr):
 172    def prereduce(src):
 173       return "(" + prereduce_expr.format(src=src) + ")"
 174    def final(src):
 175       return final_expr.format(src="(" + src + ")")
 176    def reduce_(src0, src1):
 177       return reduce_expr.format(src0=src0, src1=src1)
 178    src0 = prereduce("src0.x")
 179    src1 = prereduce("src0.y")
 180    src2 = prereduce("src0.z")
 181    src3 = prereduce("src0.w")
 182    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 183               final(reduce_(src0, src1)))
 184    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 185               final(reduce_(reduce_(src0, src1), src2)))
 186    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 187               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 188
 189 def unop_numeric_convert(name, out_type, in_type, const_expr):
 190    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 191
# mov passes its source through unchanged.
unop("mov", tuint, "src0")

# Negation, complement, sign and absolute value.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# The 64-bit case is spelled with double literals, every other bit size with
# float literals.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# Clamps: fsat to [0, 1], fsat_signed to [-1, 1], fclamp_pos to [0, +inf).
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("fsat_signed", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos", tfloat, ("fmax(src0, 0.0)"))
# Reciprocal and square roots pick the double or float form based on bit_size.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): these call the float-precision libm functions for every bit
# size, including 64-bit sources -- confirm that is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 211
 212 # Generate all of the numeric conversion opcodes
# Opcode names are built as <first letter of source type>2<first letter of
# destination type><destination bit size>, e.g. i2f32, plus a rounding-mode
# suffix for the float -> float16 variants.
for src_t in [tint, tuint, tfloat, tbool]:
   # Pick the destination base types that exist for this source type.
   if src_t == tbool:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 gets three opcodes: two with an explicit
              # rounding-mode suffix and one without.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      # Sources wider than 16 bits are rounded by converting
                      # to half with the _rtne helper and back to float.
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      # Same as above, but using the _rtz helper.
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      # No explicit rounding mode: a plain assignment.
                      conv_expr = "src0"

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32: a source wider than 32 bits goes through
              # _mesa_double_to_float_rtz() when nir_is_rounding_mode_rtz()
              # reports true for 32 bits; otherwise it is a plain assignment.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          else:
              # Conversions to bool test against zero; everything else is a
              # plain assignment.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
 269
 270 # Special opcode that is the same as f2f16 except that it is safe to remove it
 271 # if the result is immediately converted back to float32 again. This is
 272 # generated as part of the precision lowering pass. mp stands for medium
 273 # precision.
# f2f16 is the suffix-less float -> float16 opcode registered by the
# conversion loop; f2fmp reuses its constant expression verbatim.
unop_numeric_convert("f2fmp", tfloat16, tfloat, opcodes["f2f16"].const_expr)

# Unary floating-point rounding operations.
# 64-bit sources use the double libm call, every other bit size the float one.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Magnitudes below 2^-14 become a zero that keeps the sign of the source;
# everything else is converted to half and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 286
 287 # Trigonometric operations.
 288
 289
# 64-bit sources use the double libm call, every other bit size the float one.
unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp keeps only the exponent that frexp() stores through its pointer
# argument; frexp_sig keeps only the value frexp() returns.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.
# All variants constant-fold to 0.0.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 306
 307
 308 # Floating point pack and unpack operations.
 309
 310 def pack_2x16(fmt):
 311    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 312 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 313 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 314 """.replace("fmt", fmt))
 315
 316 def pack_4x8(fmt):
 317    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 318 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 319 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 320 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 321 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 322 """.replace("fmt", fmt))
 323
 324 def unpack_2x16(fmt):
 325    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 326 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 327 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 328 """.replace("fmt", fmt))
 329
 330 def unpack_4x8(fmt):
 331    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 332 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 333 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 334 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 335 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 336 """.replace("fmt", fmt))
 337
 338
# Instantiate the pack/unpack opcodes for each format.  snorm and unorm get
# both the 2x16 and the 4x8 forms; half only gets the 2x16 form.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
 349
# Two uint32 components into one uint32: the low 16 bits of x, with y shifted
# into the high 16 bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# Four uint32 components into one uint32, one byte position each.
# NOTE(review): unlike pack_uvec2_to_uint the components are not masked, so
# any bits above the low 8 of x, y or z spill into the higher bytes -- confirm
# that callers guarantee 8-bit values.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Pack narrow components into one wider scalar, x in the lowest bits.  Each
# component is cast to the destination width before being shifted.
unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
 372
 373 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
 374            "dst.x = src0.x; dst.y = src0.x >> 32;")
 375
 376 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 377            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 378
 379 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
 380            "dst.x = src0.x; dst.y = src0.x >> 16;")
 381
 382 unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
 383            "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")
 384
 385 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
 386 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
 387 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
 388 """)
 389
 390 # Lowered floating point unpacking operations.
 391
# The _x variants decode the low 16 bits of the source, the _y variants the
# high 16 bits.
unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

# Same split, decoded with unpack_half_1x16_flush_to_zero().
unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: _x is the source assigned to the narrower destination type,
# _y is the source shifted down by half its width first.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 407
 408 # Bit operations, part of ARB_gpu_shader5.
 409
 410
# Reverses the order of the 32 bits of the source.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Number of set bits in a source of any bit size; the result is a uint32.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the highest set bit, or -1 when the source is zero.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Scans down from the top bit for the first set bit and returns 31 minus its
# index.  A zero source leaves bit at -1, so the result is 32.
unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(31 - bit);
""")

# Signed variant: for negative sources the highest 0 bit is searched for
# instead of the highest 1 bit.  The result is -1 when no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the lowest set bit, or -1 when the source is zero.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 467
 468 # AMD_gcn_shader extended instructions
# Computes the 2D coordinate on the selected cube face from a 3-component
# direction.  ma is twice the component with the largest magnitude; the
# selected pair of components is divided by ma and offset by 0.5.  The checks
# run in sequence, so when magnitudes tie the later axis (y over x, z over
# both) wins.
# NOTE(review): an all-zero source leaves ma at 0, so the final statements
# divide by zero -- confirm this is acceptable for constant folding.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# Selects the face index: 0/1 for +x/-x, 2/3 for +y/-y and 4/5 for +z/-z,
# based on the component with the largest magnitude.  As above, later checks
# overwrite earlier ones when magnitudes tie.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
# Registers fsum2, fsum3 and fsum4; the components are used as-is and added.
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 505
 506 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 507    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 508           False, alg_props, const_expr)
 509
 510 def binop(name, ty, alg_props, const_expr):
 511    binop_convert(name, ty, ty, alg_props, const_expr)
 512
 513 def binop_compare(name, ty, alg_props, const_expr):
 514    binop_convert(name, tbool1, ty, alg_props, const_expr)
 515
 516 def binop_compare8(name, ty, alg_props, const_expr):
 517    binop_convert(name, tbool8, ty, alg_props, const_expr)
 518
 519 def binop_compare16(name, ty, alg_props, const_expr):
 520    binop_convert(name, tbool16, ty, alg_props, const_expr)
 521
 522 def binop_compare32(name, ty, alg_props, const_expr):
 523    binop_convert(name, tbool32, ty, alg_props, const_expr)
 524
 525 def binop_compare_all_sizes(name, ty, alg_props, const_expr):
 526    binop_compare(name, ty, alg_props, const_expr)
 527    binop_compare8(name + "8", ty, alg_props, const_expr)
 528    binop_compare16(name + "16", ty, alg_props, const_expr)
 529    binop_compare32(name + "32", ty, alg_props, const_expr)
 530
 531 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 532                 src2_type, const_expr):
 533    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 534           False, "", const_expr)
 535
 536 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 537                  reduce_expr, final_expr):
 538    def final(src):
 539       return final_expr.format(src= "(" + src + ")")
 540    def reduce_(src0, src1):
 541       return reduce_expr.format(src0=src0, src1=src1)
 542    def prereduce(src0, src1):
 543       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 544    src0 = prereduce("src0.x", "src1.x")
 545    src1 = prereduce("src0.y", "src1.y")
 546    src2 = prereduce("src0.z", "src1.z")
 547    src3 = prereduce("src0.w", "src1.w")
 548    opcode(name + "2", output_size, output_type,
 549           [2, 2], [src_type, src_type], False, _2src_commutative,
 550           final(reduce_(src0, src1)))
 551    opcode(name + "3", output_size, output_type,
 552           [3, 3], [src_type, src_type], False, _2src_commutative,
 553           final(reduce_(reduce_(src0, src1), src2)))
 554    opcode(name + "4", output_size, output_type,
 555           [4, 4], [src_type, src_type], False, _2src_commutative,
 556           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 557
 558 def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
 559                            reduce_expr, final_expr):
 560    binop_reduce(name, output_size, tbool1, src_type,
 561                 prereduce_expr, reduce_expr, final_expr)
 562    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
 563                 prereduce_expr, reduce_expr, final_expr)
 564    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
 565                 prereduce_expr, reduce_expr, final_expr)
 566    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
 567                 prereduce_expr, reduce_expr, final_expr)
 568
# When nir_is_rounding_mode_rtz() reports true for this bit size, the sum goes
# through the _rtz helpers (non-64-bit sizes are added in double precision
# first); otherwise it is a plain C addition.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add.  (1ull << (bit_size - 1)) - 1 is the largest signed
# value for the bit size and 1ull << (bit_size - 1) is the bit pattern of the
# smallest; overflow is detected from the sum moving the wrong way.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Saturating unsigned add: a sum smaller than src0 means it wrapped.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract, using the same limits as iadd_sat.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Saturating unsigned subtract: clamps at 0.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Same structure as fadd, with the subtraction helpers.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
# Absolute difference of two signed values, produced as an unsigned value.
# The operands are cast to uint64_t and the smaller is subtracted from the
# larger.
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
# Absolute difference of two unsigned values.
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
 610
# When nir_is_rounding_mode_rtz() reports true for this bit size, the product
# goes through the _rtz helpers (non-64-bit sizes are multiplied in double
# precision first); otherwise it is a plain C multiplication.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate a 64-bit result from two 32-bit quantities
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
# (for other bit sizes: the upper half of the double-width product; 64-bit
# sources go through ubm_mul_u32arr() on arrays of 32-bit words)
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
# (for other bit sizes: the upper half of the double-width product)
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# unsigned multiply of the low halves of the two sources: each source is
# masked to its low bit_size / 2 bits before the multiplication
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")

# Multiply 32-bits with low 16-bits.
# (src1 is cast to a 16-bit integer, signed or unsigned, before multiplying)
binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")
 680
# Integer division by zero folds to 0 instead of dividing; fdiv has no such
# guard.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
# (the result type is uint, so the C comparison yields 1 or 0)

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
# (the result type is uint, so the C comparison yields 1 or 0)

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 694
 695 # hadd: (a + b) >> 1 (without overflow)
 696 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
 697 #       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
 698 #       = 2 *  (x & y) + (x & ~y) +                (~x & y)
 699 #       =     ((x & y) << 1) + (x ^ y)
 700 #
 701 # Since we know that the bottom bit of (x & y) << 1 is zero,
 702 #
 703 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
 704 #              =   (x & y) +      ((x ^ y)  >> 1)
# The signed and unsigned variants share one expression; only the operand
# type differs.
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 707
 708 # rhadd: (a + b + 1) >> 1 (without overflow)
 709 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 710 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 711 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 712 #           =     ((x | y) << 1) - (x ^ y) + 1
 713 #
 714 # Since we know that the bottom bit of (x & y) << 1 is zero,
 715 #
 716 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 717 #                  = (x | y) -  ((x ^ y)      >> 1)
 718 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 719 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 720
# A zero divisor folds to 0 instead of evaluating the C "%".
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# two operands differ in sign.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient down, frem rounds it toward zero.
# NOTE(review): floorf()/truncf() are the float-precision functions and are
# used for every bit size, including 64-bit sources -- confirm this is
# intended.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 737
 738 #
 739 # Comparisons
 740 #
 741
 742
 743 # these integer-aware comparisons return a boolean (0 or ~0)
 744
 745 binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
 746 binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
 747 binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
 748 binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
 749 binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
 750 binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
 751 binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
 752 binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
 753 binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
 754 binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
 755
 756 # integer-aware GLSL-style comparisons that compare floats and ints
 757
 758 binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
 759                        "{src0} && {src1}", "{src}")
 760 binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
 761                        "{src0} || {src1}", "{src}")
 762 binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
 763                        "{src0} && {src1}", "{src}")
 764 binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
 765                        "{src0} || {src1}", "{src}")
 766
 767 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 768
 769 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
 770              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
 771 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
 772              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 773
 774 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 775 # and false respectively
 776
 777 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
 778 binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 779 binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 780 binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 781
 782 # SPIRV shifts are undefined for shift-operands >= bitsize,
 783 # but SM5 shifts are defined to use the least significant bits, only
 784 # The NIR definition is according to the SM5 specification.
 785 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
 786        "src0 << (src1 & (sizeof(src0) * 8 - 1))")
 787 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
 788        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 789 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
 790        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 791
 792 opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 793    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 794    dst = (src0 << (src1 & rotate_mask)) |
 795          (src0 >> (-src1 & rotate_mask));
 796 """)
 797 opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 798    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 799    dst = (src0 >> (src1 & rotate_mask)) |
 800          (src0 << (-src1 & rotate_mask));
 801 """)
 802
 803 # bitwise logic operators
 804 #
 805 # These are also used as boolean and, or, xor for hardware supporting
 806 # integers.
 807
 808
 809 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
 810 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
 811 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 812
 813
 814 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 815              "{src}")
 816
 817 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 818              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 819
 820 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
 821        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 822 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
 823        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 824
 825 binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
 826 binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 827 binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 828 binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
 829 binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 830 binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 831
 832 # Saturated vector add for 4 8bit ints.
 833 binop("usadd_4x8", tint32, _2src_commutative + associative, """
 834 dst = 0;
 835 for (int i = 0; i < 32; i += 8) {
 836    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 837 }
 838 """)
 839
 840 # Saturated vector subtract for 4 8bit ints.
 841 binop("ussub_4x8", tint32, "", """
 842 dst = 0;
 843 for (int i = 0; i < 32; i += 8) {
 844    int src0_chan = (src0 >> i) & 0xff;
 845    int src1_chan = (src1 >> i) & 0xff;
 846    if (src0_chan > src1_chan)
 847       dst |= (src0_chan - src1_chan) << i;
 848 }
 849 """)
 850
 851 # vector min for 4 8bit ints.
 852 binop("umin_4x8", tint32, _2src_commutative + associative, """
 853 dst = 0;
 854 for (int i = 0; i < 32; i += 8) {
 855    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 856 }
 857 """)
 858
 859 # vector max for 4 8bit ints.
 860 binop("umax_4x8", tint32, _2src_commutative + associative, """
 861 dst = 0;
 862 for (int i = 0; i < 32; i += 8) {
 863    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 864 }
 865 """)
 866
 867 # unorm multiply: (a * b) / 255.
 868 binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
 869 dst = 0;
 870 for (int i = 0; i < 32; i += 8) {
 871    int src0_chan = (src0 >> i) & 0xff;
 872    int src1_chan = (src1 >> i) & 0xff;
 873    dst |= ((src0_chan * src1_chan) / 255) << i;
 874 }
 875 """)
 876
 877 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 878
 879 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 880             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 881
 882 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 883               "src0 | ((uint64_t)src1 << 32)")
 884
 885 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
 886               "src0 | ((uint32_t)src1 << 16)")
 887
 888 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 889 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
 890 # are from the low five bits of src0 and src1, respectively.
 891 binop_convert("bfm", tuint32, tint32, "", """
 892 int bits = src0 & 0x1F;
 893 int offset = src1 & 0x1F;
 894 dst = ((1u << bits) - 1) << offset;
 895 """)
 896
 897 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
 898 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 899 /* flush denormals to zero. */
 900 if (!isnormal(dst))
 901    dst = copysignf(0.0f, src0);
 902 """)
 903
 904 # Combines the first component of each input to make a 2-component vector.
 905
 906 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 907 dst.x = src0.x;
 908 dst.y = src1.x;
 909 """)
 910
 911 # Byte extraction
 912 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 913 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 914
 915 # Word extraction
 916 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 917 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 918
 919
 920 def triop(name, ty, alg_props, const_expr):
 921    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 922 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 923    opcode(name, output_size, tuint,
 924    [src1_size, src2_size, src3_size],
 925    [tuint, tuint, tuint], False, "", const_expr)
 926
 927 triop("ffma", tfloat, _2src_commutative, """
 928 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 929    if (bit_size == 64)
 930       dst = _mesa_double_fma_rtz(src0, src1, src2);
 931    else if (bit_size == 32)
 932       dst = _mesa_float_fma_rtz(src0, src1, src2);
 933    else
 934       dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
 935 } else {
 936    if (bit_size == 32)
 937       dst = fmaf(src0, src1, src2);
 938    else
 939       dst = fma(src0, src1, src2);
 940 }
 941 """)
 942
 943 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 944
 945 # Conditional Select
 946 #
 947 # A vector conditional select instruction (like ?:, but operating per-
 948 # component on vectors). There are two versions, one for floating point
 949 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 950
 951
 952 triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 953
 954 # 3 way min/max/med
 955 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 956 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 957 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 958
 959 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 960 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 961 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 962
 963 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 964 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 965 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 966
 967 opcode("bcsel", 0, tuint, [0, 0, 0],
 968        [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
 969 opcode("b8csel", 0, tuint, [0, 0, 0],
 970        [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
 971 opcode("b16csel", 0, tuint, [0, 0, 0],
 972        [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
 973 opcode("b32csel", 0, tuint, [0, 0, 0],
 974        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 975
 976 # SM5 bfi assembly
 977 triop("bfi", tuint32, "", """
 978 unsigned mask = src0, insert = src1, base = src2;
 979 if (mask == 0) {
 980    dst = base;
 981 } else {
 982    unsigned tmp = mask;
 983    while (!(tmp & 1)) {
 984       tmp >>= 1;
 985       insert <<= 1;
 986    }
 987    dst = (base & ~mask) | (insert & mask);
 988 }
 989 """)
 990
 991
 992 triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 993
 994 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
 995 opcode("ubfe", 0, tuint32,
 996        [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
 997 unsigned base = src0;
 998 unsigned offset = src1 & 0x1F;
 999 unsigned bits = src2 & 0x1F;
1000 if (bits == 0) {
1001    dst = 0;
1002 } else if (offset + bits < 32) {
1003    dst = (base << (32 - bits - offset)) >> (32 - bits);
1004 } else {
1005    dst = base >> offset;
1006 }
1007 """)
1008 opcode("ibfe", 0, tint32,
1009        [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
1010 int base = src0;
1011 unsigned offset = src1 & 0x1F;
1012 unsigned bits = src2 & 0x1F;
1013 if (bits == 0) {
1014    dst = 0;
1015 } else if (offset + bits < 32) {
1016    dst = (base << (32 - bits - offset)) >> (32 - bits);
1017 } else {
1018    dst = base >> offset;
1019 }
1020 """)
1021
1022 # GLSL bitfieldExtract()
1023 opcode("ubitfield_extract", 0, tuint32,
1024        [0, 0, 0], [tuint32, tint32, tint32], False, "", """
1025 unsigned base = src0;
1026 int offset = src1, bits = src2;
1027 if (bits == 0) {
1028    dst = 0;
1029 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
1030    dst = 0; /* undefined per the spec */
1031 } else {
1032    dst = (base >> offset) & ((1ull << bits) - 1);
1033 }
1034 """)
# Signed bitfield extract.  A zero width yields 0, as does any negative or
# out-of-range offset/width.  Otherwise the field is shifted up so that its
# top bit lands in bit 31, then shifted back down by (32 - bits) so the
# arithmetic right shift sign-extends it into the low bits of the result.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1047
1048 # Combines the first component of each input to make a 3-component vector.
1049
1050 triop_horiz("vec3", 3, 1, 1, 1, """
1051 dst.x = src0.x;
1052 dst.y = src1.x;
1053 dst.z = src2.x;
1054 """)
1055
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit operand sizes.

   - output_size is the size passed for the result
   - src1_size .. src4_size are the sizes passed for the four inputs
   - const_expr is forwarded to opcode() unchanged

   The result and all inputs are typed tuint, no algebraic properties are
   set, and the opcode is never flagged as a conversion.
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   src_types = [tuint] * 4
   opcode(name, output_size, tuint, src_sizes, src_types, False, "", const_expr)
1062
1063 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
1064        [tuint32, tuint32, tint32, tint32], False, "", """
1065 unsigned base = src0, insert = src1;
1066 int offset = src2, bits = src3;
1067 if (bits == 0) {
1068    dst = base;
1069 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
1070    dst = 0;
1071 } else {
1072    unsigned mask = ((1ull << bits) - 1) << offset;
1073    dst = (base & ~mask) | ((insert << offset) & mask);
1074 }
1075 """)
1076
1077 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
1078 dst.x = src0.x;
1079 dst.y = src1.x;
1080 dst.z = src2.x;
1081 dst.w = src3.x;
1082 """)
1083
1084 opcode("vec8", 8, tuint,
1085        [1] * 8, [tuint] * 8,
1086        False, "", """
1087 dst.x = src0.x;
1088 dst.y = src1.x;
1089 dst.z = src2.x;
1090 dst.w = src3.x;
1091 dst.e = src4.x;
1092 dst.f = src5.x;
1093 dst.g = src6.x;
1094 dst.h = src7.x;
1095 """)
1096
1097 opcode("vec16", 16, tuint,
1098        [1] * 16, [tuint] * 16,
1099        False, "", """
1100 dst.x = src0.x;
1101 dst.y = src1.x;
1102 dst.z = src2.x;
1103 dst.w = src3.x;
1104 dst.e = src4.x;
1105 dst.f = src5.x;
1106 dst.g = src6.x;
1107 dst.h = src7.x;
1108 dst.i = src8.x;
1109 dst.j = src9.x;
1110 dst.k = src10.x;
1111 dst.l = src11.x;
1112 dst.m = src12.x;
1113 dst.n = src13.x;
1114 dst.o = src14.x;
1115 dst.p = src15.x;
1116 """)
1117
# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
1128 binop("amul", tint, _2src_commutative + associative, "src0 * src1")
1129
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
1133 opcode("imadsh_mix16", 0, tint32,
1134        [0, 0, 0], [tint32, tint32, tint32], False, "", """
1135 dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
1136 """)
1137
1138 # ir3-specific instruction that maps directly to ir3 mad.s24.
1139 #
1140 # 24b multiply into 32b result (with sign extension) plus 32b int
1141 triop("imad24_ir3", tint32, _2src_commutative,
1142       "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")
1143
1144 # 24b multiply into 32b result (with sign extension)
1145 binop("imul24", tint32, _2src_commutative + associative,
1146       "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")
1147
1148 # unsigned 24b multiply into 32b result plus 32b int
1149 triop("umad24", tuint32, _2src_commutative,
1150       "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")
1151
# unsigned 24b multiply into 32b result uint
#
# Declared as tuint32 to match the unsigned arithmetic in the expression and
# the sibling umad24 opcode.
binop("umul24", tuint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")