src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16)
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
  93 # helper variables for strings
  94 tfloat = "float"
  95 tint = "int"
  96 tbool = "bool"
  97 tbool1 = "bool1"
  98 tbool8 = "bool8"
  99 tbool16 = "bool16"
 100 tbool32 = "bool32"
 101 tuint = "uint"
 102 tuint16 = "uint16"
 103 tfloat16 = "float16"
 104 tfloat32 = "float32"
 105 tint32 = "int32"
 106 tuint32 = "uint32"
 107 tint64 = "int64"
 108 tuint64 = "uint64"
 109 tfloat64 = "float64"
 110
 111 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 112
 113 def type_has_size(type_):
 114     m = _TYPE_SPLIT_RE.match(type_)
 115     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 116     return m.group('bits') is not None
 117
 118 def type_size(type_):
 119     m = _TYPE_SPLIT_RE.match(type_)
 120     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 121     assert m.group('bits') is not None, \
 122            'NIR type string has no bit size: "{}"'.format(type_)
 123     return int(m.group('bits'))
 124
 125 def type_sizes(type_):
 126     if type_has_size(type_):
 127         return [type_size(type_)]
 128     elif type_ == 'bool':
 129         return [1, 8, 16, 32]
 130     elif type_ == 'float':
 131         return [16, 32, 64]
 132     else:
 133         return [1, 8, 16, 32, 64]
 134
 135 def type_base_type(type_):
 136     m = _TYPE_SPLIT_RE.match(type_)
 137     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 138     return m.group('type')
 139
 140 # Operation where the first two sources are commutative.
 141 #
 142 # For 2-source operations, this just mathematical commutativity.  Some
 143 # 3-source operations, like ffma, are only commutative in the first two
 144 # sources.
 145 _2src_commutative = "2src_commutative "
 146 associative = "associative "
 147
 148 # global dictionary of opcodes
 149 opcodes = {}
 150
 151 def opcode(name, output_size, output_type, input_sizes, input_types,
 152            is_conversion, algebraic_properties, const_expr):
 153    assert name not in opcodes
 154    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 155                           input_types, is_conversion, algebraic_properties,
 156                           const_expr)
 157
 158 def unop_convert(name, out_type, in_type, const_expr):
 159    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 160
 161 def unop(name, ty, const_expr):
 162    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 163
 164 def unop_horiz(name, output_size, output_type, input_size, input_type,
 165                const_expr):
 166    opcode(name, output_size, output_type, [input_size], [input_type],
 167           False, "", const_expr)
 168
 169 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 170                 reduce_expr, final_expr):
 171    def prereduce(src):
 172       return "(" + prereduce_expr.format(src=src) + ")"
 173    def final(src):
 174       return final_expr.format(src="(" + src + ")")
 175    def reduce_(src0, src1):
 176       return reduce_expr.format(src0=src0, src1=src1)
 177    src0 = prereduce("src0.x")
 178    src1 = prereduce("src0.y")
 179    src2 = prereduce("src0.z")
 180    src3 = prereduce("src0.w")
 181    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 182               final(reduce_(src0, src1)))
 183    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 184               final(reduce_(reduce_(src0, src1), src2)))
 185    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 186               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 187
 188 def unop_numeric_convert(name, out_type, in_type, const_expr):
 189    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 190
 191 unop("mov", tuint, "src0")
 192
 193 unop("ineg", tint, "-src0")
 194 unop("fneg", tfloat, "-src0")
 195 unop("inot", tint, "~src0") # invert every bit of the integer
 196 unop("fsign", tfloat, ("bit_size == 64 ? " +
 197                        "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
 198                        "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
 199 unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
 200 unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
 201 unop("fabs", tfloat, "fabs(src0)")
 202 unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
 203 unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
 204 unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
 205 unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
 206 unop("fexp2", tfloat, "exp2f(src0)")
 207 unop("flog2", tfloat, "log2f(src0)")
 208
 209 # Generate all of the numeric conversion opcodes
 210 for src_t in [tint, tuint, tfloat, tbool]:
 211    if src_t == tbool:
 212       dst_types = [tfloat, tint, tbool]
 213    elif src_t == tint:
 214       dst_types = [tfloat, tint, tbool]
 215    elif src_t == tuint:
 216       dst_types = [tfloat, tuint]
 217    elif src_t == tfloat:
 218       dst_types = [tint, tuint, tfloat, tbool]
 219
 220    for dst_t in dst_types:
 221       for dst_bit_size in type_sizes(dst_t):
 222           if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
 223               rnd_modes = ['_rtne', '_rtz', '']
 224               for rnd_mode in rnd_modes:
 225                   if rnd_mode == '_rtne':
 226                       conv_expr = """
 227                       if (bit_size > 16) {
 228                          dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
 229                       } else {
 230                          dst = src0;
 231                       }
 232                       """
 233                   elif rnd_mode == '_rtz':
 234                       conv_expr = """
 235                       if (bit_size > 16) {
 236                          dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
 237                       } else {
 238                          dst = src0;
 239                       }
 240                       """
 241                   else:
 242                       conv_expr = "src0"
 243
 244                   unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
 245                                                               dst_t[0],
 246                                                               dst_bit_size,
 247                                                               rnd_mode),
 248                                        dst_t + str(dst_bit_size),
 249                                        src_t, conv_expr)
 250           elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
 251               conv_expr = """
 252               if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
 253                  dst = _mesa_double_to_float_rtz(src0);
 254               } else {
 255                  dst = src0;
 256               }
 257               """
 258               unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
 259                                                        dst_bit_size),
 260                                    dst_t + str(dst_bit_size), src_t, conv_expr)
 261           else:
 262               conv_expr = "src0 != 0" if dst_t == tbool else "src0"
 263               unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
 264                                                        dst_bit_size),
 265                                    dst_t + str(dst_bit_size), src_t, conv_expr)
 266
 267 # Special opcode that is the same as f2f16 except that it is safe to remove it
 268 # if the result is immediately converted back to float32 again. This is
 269 # generated as part of the precision lowering pass. mp stands for medium
 270 # precision.
 271 unop_numeric_convert("f2fmp", tfloat16, tfloat, opcodes["f2f16"].const_expr)
 272
 273 # Unary floating-point rounding operations.
 274
 275
 276 unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
 277 unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
 278 unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
 279 unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
 280 unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
 281
 282 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 283
 284 # Trigonometric operations.
 285
 286
 287 unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
 288 unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
 289
 290 # dfrexp
 291 unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
 292 unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
 293
 294 # Partial derivatives.
 295
 296
 297 unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
 298 unop("fddy", tfloat, "0.0")
 299 unop("fddx_fine", tfloat, "0.0")
 300 unop("fddy_fine", tfloat, "0.0")
 301 unop("fddx_coarse", tfloat, "0.0")
 302 unop("fddy_coarse", tfloat, "0.0")
 303
 304
 305 # Floating point pack and unpack operations.
 306
 307 def pack_2x16(fmt):
 308    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 309 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 310 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 311 """.replace("fmt", fmt))
 312
 313 def pack_4x8(fmt):
 314    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 315 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 316 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 317 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 318 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 319 """.replace("fmt", fmt))
 320
 321 def unpack_2x16(fmt):
 322    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 323 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 324 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 325 """.replace("fmt", fmt))
 326
 327 def unpack_4x8(fmt):
 328    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 329 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 330 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 331 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 332 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 333 """.replace("fmt", fmt))
 334
 335
 336 pack_2x16("snorm")
 337 pack_4x8("snorm")
 338 pack_2x16("unorm")
 339 pack_4x8("unorm")
 340 pack_2x16("half")
 341 unpack_2x16("snorm")
 342 unpack_4x8("snorm")
 343 unpack_2x16("unorm")
 344 unpack_4x8("unorm")
 345 unpack_2x16("half")
 346
 347 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 348 dst.x = (src0.x & 0xffff) | (src0.y << 16);
 349 """)
 350
 351 unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
 352 dst.x = (src0.x <<  0) |
 353         (src0.y <<  8) |
 354         (src0.z << 16) |
 355         (src0.w << 24);
 356 """)
 357
 358 unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
 359            "dst.x = src0.x | ((uint32_t)src0.y << 16);")
 360
 361 unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
 362            "dst.x = src0.x | ((uint64_t)src0.y << 32);")
 363
 364 unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
 365            "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
 366
 367 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
 368            "dst.x = src0.x; dst.y = src0.x >> 32;")
 369
 370 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 371            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 372
 373 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
 374            "dst.x = src0.x; dst.y = src0.x >> 16;")
 375
 376 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
 377 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
 378 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
 379 """)
 380
 381 # Lowered floating point unpacking operations.
 382
 383 unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
 384              "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
 385 unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
 386              "unpack_half_1x16((uint16_t)(src0 >> 16))")
 387
 388 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
 389              "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
 390 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
 391              "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
 392
 393 unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
 394 unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")
 395
 396 unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
 397 unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 398
 399 # Bit operations, part of ARB_gpu_shader5.
 400
 401
 402 unop("bitfield_reverse", tuint32, """
 403 /* we're not winning any awards for speed here, but that's ok */
 404 dst = 0;
 405 for (unsigned bit = 0; bit < 32; bit++)
 406    dst |= ((src0 >> bit) & 1) << (31 - bit);
 407 """)
 408 unop_convert("bit_count", tuint32, tuint, """
 409 dst = 0;
 410 for (unsigned bit = 0; bit < bit_size; bit++) {
 411    if ((src0 >> bit) & 1)
 412       dst++;
 413 }
 414 """)
 415
 416 unop_convert("ufind_msb", tint32, tuint, """
 417 dst = -1;
 418 for (int bit = bit_size - 1; bit >= 0; bit--) {
 419    if ((src0 >> bit) & 1) {
 420       dst = bit;
 421       break;
 422    }
 423 }
 424 """)
 425
 426 unop("uclz", tuint32, """
 427 int bit;
 428 for (bit = bit_size - 1; bit >= 0; bit--) {
 429    if ((src0 & (1u << bit)) != 0)
 430       break;
 431 }
 432 dst = (unsigned)(31 - bit);
 433 """)
 434
 435 unop("ifind_msb", tint32, """
 436 dst = -1;
 437 for (int bit = 31; bit >= 0; bit--) {
 438    /* If src0 < 0, we're looking for the first 0 bit.
 439     * if src0 >= 0, we're looking for the first 1 bit.
 440     */
 441    if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
 442       (!((src0 >> bit) & 1) && (src0 < 0))) {
 443       dst = bit;
 444       break;
 445    }
 446 }
 447 """)
 448
 449 unop_convert("find_lsb", tint32, tint, """
 450 dst = -1;
 451 for (unsigned bit = 0; bit < bit_size; bit++) {
 452    if ((src0 >> bit) & 1) {
 453       dst = bit;
 454       break;
 455    }
 456 }
 457 """)
 458
 459 # AMD_gcn_shader extended instructions
 460 unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
 461 dst.x = dst.y = 0.0;
 462 float absX = fabsf(src0.x);
 463 float absY = fabsf(src0.y);
 464 float absZ = fabsf(src0.z);
 465
 466 float ma = 0.0;
 467 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
 468 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
 469 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
 470
 471 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
 472 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
 473 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
 474 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
 475 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
 476 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
 477
 478 dst.x = dst.x / ma + 0.5;
 479 dst.y = dst.y / ma + 0.5;
 480 """)
 481
 482 unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
 483 float absX = fabsf(src0.x);
 484 float absY = fabsf(src0.y);
 485 float absZ = fabsf(src0.z);
 486 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
 487 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
 488 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
 489 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
 490 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
 491 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
 492 """)
 493
 494 # Sum of vector components
 495 unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 496
 497 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 498    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 499           False, alg_props, const_expr)
 500
 501 def binop(name, ty, alg_props, const_expr):
 502    binop_convert(name, ty, ty, alg_props, const_expr)
 503
 504 def binop_compare(name, ty, alg_props, const_expr):
 505    binop_convert(name, tbool1, ty, alg_props, const_expr)
 506
 507 def binop_compare8(name, ty, alg_props, const_expr):
 508    binop_convert(name, tbool8, ty, alg_props, const_expr)
 509
 510 def binop_compare16(name, ty, alg_props, const_expr):
 511    binop_convert(name, tbool16, ty, alg_props, const_expr)
 512
 513 def binop_compare32(name, ty, alg_props, const_expr):
 514    binop_convert(name, tbool32, ty, alg_props, const_expr)
 515
 516 def binop_compare_all_sizes(name, ty, alg_props, const_expr):
 517    binop_compare(name, ty, alg_props, const_expr)
 518    binop_compare8(name + "8", ty, alg_props, const_expr)
 519    binop_compare16(name + "16", ty, alg_props, const_expr)
 520    binop_compare32(name + "32", ty, alg_props, const_expr)
 521
 522 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 523                 src2_type, const_expr):
 524    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 525           False, "", const_expr)
 526
 527 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 528                  reduce_expr, final_expr):
 529    def final(src):
 530       return final_expr.format(src= "(" + src + ")")
 531    def reduce_(src0, src1):
 532       return reduce_expr.format(src0=src0, src1=src1)
 533    def prereduce(src0, src1):
 534       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 535    src0 = prereduce("src0.x", "src1.x")
 536    src1 = prereduce("src0.y", "src1.y")
 537    src2 = prereduce("src0.z", "src1.z")
 538    src3 = prereduce("src0.w", "src1.w")
 539    opcode(name + "2", output_size, output_type,
 540           [2, 2], [src_type, src_type], False, _2src_commutative,
 541           final(reduce_(src0, src1)))
 542    opcode(name + "3", output_size, output_type,
 543           [3, 3], [src_type, src_type], False, _2src_commutative,
 544           final(reduce_(reduce_(src0, src1), src2)))
 545    opcode(name + "4", output_size, output_type,
 546           [4, 4], [src_type, src_type], False, _2src_commutative,
 547           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 548
 549 def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
 550                            reduce_expr, final_expr):
 551    binop_reduce(name, output_size, tbool1, src_type,
 552                 prereduce_expr, reduce_expr, final_expr)
 553    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
 554                 prereduce_expr, reduce_expr, final_expr)
 555    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
 556                 prereduce_expr, reduce_expr, final_expr)
 557    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
 558                 prereduce_expr, reduce_expr, final_expr)
 559
 560 binop("fadd", tfloat, _2src_commutative + associative,"""
 561 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 562    if (bit_size == 64)
 563       dst = _mesa_double_add_rtz(src0, src1);
 564    else
 565       dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
 566 } else {
 567    dst = src0 + src1;
 568 }
 569 """)
 570 binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
 571 binop("iadd_sat", tint, _2src_commutative, """
 572       src1 > 0 ?
 573          (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
 574          (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
 575 """)
 576 binop("uadd_sat", tuint, _2src_commutative,
 577       "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
 578 binop("isub_sat", tint, "", """
 579       src1 < 0 ?
 580          (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
 581          (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
 582 """)
 583 binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")
 584
 585 binop("fsub", tfloat, "", """
 586 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 587    if (bit_size == 64)
 588       dst = _mesa_double_sub_rtz(src0, src1);
 589    else
 590       dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
 591 } else {
 592    dst = src0 - src1;
 593 }
 594 """)
 595 binop("isub", tint, "", "src0 - src1")
 596 binop_convert("uabs_isub", tuint, tint, "", """
 597               src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
 598                           : (uint64_t) src0 - (uint64_t) src1
 599 """)
 600 binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
 601
 602 binop("fmul", tfloat, _2src_commutative + associative, """
 603 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 604    if (bit_size == 64)
 605       dst = _mesa_double_mul_rtz(src0, src1);
 606    else
 607       dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
 608 } else {
 609    dst = src0 * src1;
 610 }
 611 """)
 612 # low 32-bits of signed/unsigned integer multiply
 613 binop("imul", tint, _2src_commutative + associative, "src0 * src1")
 614
 615 # Generate 64 bit result from 2 32 bits quantity
 616 binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
 617               "(int64_t)src0 * (int64_t)src1")
 618 binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
 619               "(uint64_t)src0 * (uint64_t)src1")
 620
 621 # high 32-bits of signed integer multiply
 622 binop("imul_high", tint, _2src_commutative, """
 623 if (bit_size == 64) {
 624    /* We need to do a full 128-bit x 128-bit multiply in order for the sign
 625     * extension to work properly.  The casts are kind-of annoying but needed
 626     * to prevent compiler warnings.
 627     */
 628    uint32_t src0_u32[4] = {
 629       src0,
 630       (int64_t)src0 >> 32,
 631       (int64_t)src0 >> 63,
 632       (int64_t)src0 >> 63,
 633    };
 634    uint32_t src1_u32[4] = {
 635       src1,
 636       (int64_t)src1 >> 32,
 637       (int64_t)src1 >> 63,
 638       (int64_t)src1 >> 63,
 639    };
 640    uint32_t prod_u32[4];
 641    ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
 642    dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
 643 } else {
 644    dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
 645 }
 646 """)
 647
 648 # high 32-bits of unsigned integer multiply
 649 binop("umul_high", tuint, _2src_commutative, """
 650 if (bit_size == 64) {
 651    /* The casts are kind-of annoying but needed to prevent compiler warnings. */
 652    uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
 653    uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
 654    uint32_t prod_u32[4];
 655    ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
 656    dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
 657 } else {
 658    dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
 659 }
 660 """)
 661
 662 # low 32-bits of unsigned integer multiply
 663 binop("umul_low", tuint32, _2src_commutative, """
 664 uint64_t mask = (1 << (bit_size / 2)) - 1;
 665 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
 666 """)
 667
 668 # Multiply 32-bits with low 16-bits.
 669 binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
 670 binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")
 671
 672 binop("fdiv", tfloat, "", "src0 / src1")
 673 binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
 674 binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
 675
 676 # returns a boolean representing the carry resulting from the addition of
 677 # the two unsigned arguments.
 678
 679 binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")
 680
 681 # returns a boolean representing the borrow resulting from the subtraction
 682 # of the two unsigned arguments.
 683
 684 binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 685
 686 # hadd: (a + b) >> 1 (without overflow)
 687 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
 688 #       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
 689 #       = 2 *  (x & y) + (x & ~y) +                (~x & y)
 690 #       =     ((x & y) << 1) + (x ^ y)
 691 #
 692 # Since we know that the bottom bit of (x & y) << 1 is zero,
 693 #
 694 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
 695 #              =   (x & y) +      ((x ^ y)  >> 1)
 696 binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 697 binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 698
 699 # rhadd: (a + b + 1) >> 1 (without overflow)
 700 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 701 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 702 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 703 #           =     ((x | y) << 1) - (x ^ y) + 1
 704 #
 705 # Since we know that the bottom bit of (x & y) << 1 is zero,
 706 #
 707 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 708 #                  = (x | y) -  ((x ^ y)      >> 1)
 709 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 710 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 711
 712 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
 713
 714 # For signed integers, there are several different possible definitions of
 715 # "modulus" or "remainder".  We follow the conventions used by LLVM and
 716 # SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
 717 # operation while the imod opcode implements the more mathematical
 718 # "modulus" operation.  For details on the difference, see
 719 #
 720 # http://mathforum.org/library/drmath/view/52343.html
 721
 722 binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
 723 binop("imod", tint, "",
 724       "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
 725       "                 src0 % src1 : src0 % src1 + src1)")
 726 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 727 binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 728
 729 #
 730 # Comparisons
 731 #
 732
 733
 734 # these integer-aware comparisons return a boolean (0 or ~0)
 735
 736 binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
 737 binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
 738 binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
 739 binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
 740 binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
 741 binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
 742 binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
 743 binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
 744 binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
 745 binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
 746
 747 # integer-aware GLSL-style comparisons that compare floats and ints
 748
 749 binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
 750                        "{src0} && {src1}", "{src}")
 751 binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
 752                        "{src0} || {src1}", "{src}")
 753 binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
 754                        "{src0} && {src1}", "{src}")
 755 binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
 756                        "{src0} || {src1}", "{src}")
 757
 758 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 759
 760 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
 761              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
 762 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
 763              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 764
 765 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 766 # and false respectively
 767
 768 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
 769 binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 770 binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 771 binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 772
 773 # SPIRV shifts are undefined for shift-operands >= bitsize,
 774 # but SM5 shifts are defined to use the least significant bits, only
 775 # The NIR definition is according to the SM5 specification.
 776 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
 777        "src0 << (src1 & (sizeof(src0) * 8 - 1))")
 778 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
 779        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 780 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
 781        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 782
 783 opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 784    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 785    dst = (src0 << (src1 & rotate_mask)) |
 786          (src0 >> (-src1 & rotate_mask));
 787 """)
 788 opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 789    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 790    dst = (src0 >> (src1 & rotate_mask)) |
 791          (src0 << (-src1 & rotate_mask));
 792 """)
 793
 794 # bitwise logic operators
 795 #
 796 # These are also used as boolean and, or, xor for hardware supporting
 797 # integers.
 798
 799
 800 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
 801 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
 802 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 803
 804
 805 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 806              "{src}")
 807
 808 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 809              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 810
 811 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
 812        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 813 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
 814        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 815
 816 binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
 817 binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 818 binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 819 binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
 820 binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 821 binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 822
 823 # Saturated vector add for 4 8bit ints.
 824 binop("usadd_4x8", tint32, _2src_commutative + associative, """
 825 dst = 0;
 826 for (int i = 0; i < 32; i += 8) {
 827    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 828 }
 829 """)
 830
 831 # Saturated vector subtract for 4 8bit ints.
 832 binop("ussub_4x8", tint32, "", """
 833 dst = 0;
 834 for (int i = 0; i < 32; i += 8) {
 835    int src0_chan = (src0 >> i) & 0xff;
 836    int src1_chan = (src1 >> i) & 0xff;
 837    if (src0_chan > src1_chan)
 838       dst |= (src0_chan - src1_chan) << i;
 839 }
 840 """)
 841
 842 # vector min for 4 8bit ints.
 843 binop("umin_4x8", tint32, _2src_commutative + associative, """
 844 dst = 0;
 845 for (int i = 0; i < 32; i += 8) {
 846    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 847 }
 848 """)
 849
 850 # vector max for 4 8bit ints.
 851 binop("umax_4x8", tint32, _2src_commutative + associative, """
 852 dst = 0;
 853 for (int i = 0; i < 32; i += 8) {
 854    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 855 }
 856 """)
 857
 858 # unorm multiply: (a * b) / 255.
 859 binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
 860 dst = 0;
 861 for (int i = 0; i < 32; i += 8) {
 862    int src0_chan = (src0 >> i) & 0xff;
 863    int src1_chan = (src1 >> i) & 0xff;
 864    dst |= ((src0_chan * src1_chan) / 255) << i;
 865 }
 866 """)
 867
 868 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 869
 870 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 871             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 872
 873 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 874               "src0 | ((uint64_t)src1 << 32)")
 875
 876 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
 877               "src0 | ((uint32_t)src1 << 16)")
 878
 879 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 880 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
 881 # are from the low five bits of src0 and src1, respectively.
 882 binop_convert("bfm", tuint32, tint32, "", """
 883 int bits = src0 & 0x1F;
 884 int offset = src1 & 0x1F;
 885 dst = ((1u << bits) - 1) << offset;
 886 """)
 887
 888 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
 889 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 890 /* flush denormals to zero. */
 891 if (!isnormal(dst))
 892    dst = copysignf(0.0f, src0);
 893 """)
 894
 895 # Combines the first component of each input to make a 2-component vector.
 896
 897 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 898 dst.x = src0.x;
 899 dst.y = src1.x;
 900 """)
 901
 902 # Byte extraction
 903 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 904 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 905
 906 # Word extraction
 907 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 908 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 909
 910
 911 def triop(name, ty, alg_props, const_expr):
 912    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 913 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 914    opcode(name, output_size, tuint,
 915    [src1_size, src2_size, src3_size],
 916    [tuint, tuint, tuint], False, "", const_expr)
 917
 918 triop("ffma", tfloat, _2src_commutative, """
 919 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 920    if (bit_size == 64)
 921       dst = _mesa_double_fma_rtz(src0, src1, src2);
 922    else if (bit_size == 32)
 923       dst = _mesa_float_fma_rtz(src0, src1, src2);
 924    else
 925       dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
 926 } else {
 927    if (bit_size == 32)
 928       dst = fmaf(src0, src1, src2);
 929    else
 930       dst = fma(src0, src1, src2);
 931 }
 932 """)
 933
 934 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 935
 936 # Conditional Select
 937 #
 938 # A vector conditional select instruction (like ?:, but operating per-
 939 # component on vectors). There are two versions, one for floating point
 940 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 941
 942
 943 triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 944
 945 # 3 way min/max/med
 946 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 947 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 948 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 949
 950 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 951 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 952 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 953
 954 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 955 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 956 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 957
 958 opcode("bcsel", 0, tuint, [0, 0, 0],
 959        [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
 960 opcode("b8csel", 0, tuint, [0, 0, 0],
 961        [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
 962 opcode("b16csel", 0, tuint, [0, 0, 0],
 963        [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
 964 opcode("b32csel", 0, tuint, [0, 0, 0],
 965        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 966
 967 # SM5 bfi assembly
 968 triop("bfi", tuint32, "", """
 969 unsigned mask = src0, insert = src1, base = src2;
 970 if (mask == 0) {
 971    dst = base;
 972 } else {
 973    unsigned tmp = mask;
 974    while (!(tmp & 1)) {
 975       tmp >>= 1;
 976       insert <<= 1;
 977    }
 978    dst = (base & ~mask) | (insert & mask);
 979 }
 980 """)
 981
 982
 983 triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 984
 985 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
 986 opcode("ubfe", 0, tuint32,
 987        [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
 988 unsigned base = src0;
 989 unsigned offset = src1 & 0x1F;
 990 unsigned bits = src2 & 0x1F;
 991 if (bits == 0) {
 992    dst = 0;
 993 } else if (offset + bits < 32) {
 994    dst = (base << (32 - bits - offset)) >> (32 - bits);
 995 } else {
 996    dst = base >> offset;
 997 }
 998 """)
 999 opcode("ibfe", 0, tint32,
1000        [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
1001 int base = src0;
1002 unsigned offset = src1 & 0x1F;
1003 unsigned bits = src2 & 0x1F;
1004 if (bits == 0) {
1005    dst = 0;
1006 } else if (offset + bits < 32) {
1007    dst = (base << (32 - bits - offset)) >> (32 - bits);
1008 } else {
1009    dst = base >> offset;
1010 }
1011 """)
1012
# GLSL bitfieldExtract(), unsigned flavour.  A zero width yields 0, and so
# does any negative or out-of-range offset/width combination; otherwise the
# field is shifted down to bit 0 and masked to "bits" bits.
opcode(
   "ubitfield_extract", 0, tuint32,
   [0] * 3, [tuint32, tint32, tint32],
   False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# GLSL bitfieldExtract(), signed flavour.  A zero width yields 0, and so does
# any negative or out-of-range offset/width combination.  Otherwise the field
# is shifted left until its top bit sits in bit 31 and then shifted right by
# (32 - bits), which brings the field down to bit 0 and sign-extends it.  This
# is the same shift pair ibfe uses.  Example: base = 0xF0, offset = 4,
# bits = 4 gives 0xF0000000 >> 28 = -1.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1038
# vec3: builds a 3-component vector from the first component of each of its
# three single-component inputs.

triop_horiz(
   "vec3",
   3,
   1, 1, 1,
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1046
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit operand sizes.

   The result and all four sources are typed tuint.  The opcode is
   registered through opcode() with False in the conversion slot and an
   empty algebraic-properties string.

   - name is the name of the opcode
   - output_size is the size passed for the result
   - src1_size .. src4_size are the sizes passed for the four sources, in
     order
   - const_expr is the constant-folding expression, passed through as-is
   """
   input_sizes = [src1_size, src2_size, src3_size, src4_size]
   input_types = [tuint] * len(input_sizes)
   opcode(name, output_size, tuint, input_sizes, input_types, False, "",
          const_expr)
1053
# Bit-field insert taking base, insert value, offset and width.  A zero width
# returns the base unchanged; a negative or out-of-range offset/width
# combination yields 0; otherwise "bits" bits of insert are placed at
# "offset" inside base.  The mask is built from a 64-bit one so that a width
# of 32 does not shift by the full width of a 32-bit value.
opcode(
   "bitfield_insert", 0, tuint32,
   [0] * 4, [tuint32, tuint32, tint32, tint32],
   False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1067
# vec4 / vec8 / vec16: build a wide vector from the first component of each
# single-component input.  Destination components beyond w are named e, f,
# g, ... in the constant expressions.
quadop_horiz(
   "vec4",
   4,
   1, 1, 1, 1,
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

opcode(
   "vec8", 8, tuint,
   [1] * 8,
   [tuint] * 8,
   False,
   "",
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

opcode(
   "vec16", 16, tuint,
   [1] * 16,
   [tuint] * 16,
   False,
   "",
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")
1108
# amul: an integer multiply intended for address calculation.  It behaves
# like imul except that the result is undefined on overflow, where overflow
# is judged against the size of the variable being dereferenced.
#
# The looser contract lets an optimization pass propagate bounds (i.e. from a
# load/store intrinsic) back to the sources so that lower precision integer
# multiplies can be used, which helps on hardware that has 24b or perhaps
# 16b integer multiply instructions.
binop("amul", tint,
      _2src_commutative + associative,
      "src0 * src1")
1120
# ir3-specific instruction that maps directly to mul-add shift high mix
# (IMADSH_MIX16, i.e. ah * bl << 16 + c): the high 16 bits of src0 times the
# low 16 bits of src1, shifted left by 16, plus src2.  It is used for
# lowering integer multiplication (imul) on the Freedreno backend.
opcode(
   "imadsh_mix16", 0, tint32,
   [0] * 3, [tint32] * 3,
   False, "", """
dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
""")
1128
# 24-bit multiplies.  Each operand is reduced to 24 bits by shifting it left
# and then right by 8: as int32_t this sign-extends the low 24 bits, as
# uint32_t it clears the top 8 bits.

# ir3-specific instruction that maps directly to ir3 mad.s24:
# 24b multiply into 32b result (with sign extension) plus 32b int
triop(
   "imad24_ir3", tint32, _2src_commutative,
   "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension)
binop(
   "imul24", tint32, _2src_commutative + associative,
   "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")

# unsigned 24b multiply into 32b result plus 32b int
triop(
   "umad24", tuint32, _2src_commutative,
   "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")
1142
# unsigned 24b multiply into 32b result uint
#
# The opcode is declared with the unsigned 32-bit type so that its operand
# and result types agree with the uint32_t arithmetic in the expression and
# with the sibling umad24 opcode.  Both operands are truncated to their low
# 24 bits (shift left then right by 8 as uint32_t) before multiplying.
binop("umul24", tuint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")