src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
# helper variables for strings
#
# Each constant is a NIR type string as accepted by the Opcode constructor.
# Names without a numeric suffix ("float", "int", "bool", "uint") carry no
# bit size; names with a suffix are a base type followed by a bit size.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Splits a type string into a base type ("type" group) and an optional
# decimal bit size ("bits" group).  Used with .match(), so it is anchored
# only at the start of the string.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 111
 112 def type_has_size(type_):
 113     m = _TYPE_SPLIT_RE.match(type_)
 114     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 115     return m.group('bits') is not None
 116
 117 def type_size(type_):
 118     m = _TYPE_SPLIT_RE.match(type_)
 119     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 120     assert m.group('bits') is not None, \
 121            'NIR type string has no bit size: "{}"'.format(type_)
 122     return int(m.group('bits'))
 123
 124 def type_sizes(type_):
 125     if type_has_size(type_):
 126         return [type_size(type_)]
 127     elif type_ == 'bool':
 128         return [1, 8, 16, 32]
 129     elif type_ == 'float':
 130         return [16, 32, 64]
 131     else:
 132         return [1, 8, 16, 32, 64]
 133
 134 def type_base_type(type_):
 135     m = _TYPE_SPLIT_RE.match(type_)
 136     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 137     return m.group('type')
 138
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
#
# Both property strings end with a space so that several of them can be
# joined with "+" into the space-separated algebraic_properties string.
_2src_commutative = "2src_commutative "
associative = "associative "

# global dictionary of opcodes, keyed by opcode name; filled in by opcode()
opcodes = {}
 149
 150 def opcode(name, output_size, output_type, input_sizes, input_types,
 151            is_conversion, algebraic_properties, const_expr):
 152    assert name not in opcodes
 153    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 154                           input_types, is_conversion, algebraic_properties,
 155                           const_expr)
 156
 157 def unop_convert(name, out_type, in_type, const_expr):
 158    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 159
 160 def unop(name, ty, const_expr):
 161    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 162
 163 def unop_horiz(name, output_size, output_type, input_size, input_type,
 164                const_expr):
 165    opcode(name, output_size, output_type, [input_size], [input_type],
 166           False, "", const_expr)
 167
 168 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 169                 reduce_expr, final_expr):
 170    def prereduce(src):
 171       return "(" + prereduce_expr.format(src=src) + ")"
 172    def final(src):
 173       return final_expr.format(src="(" + src + ")")
 174    def reduce_(src0, src1):
 175       return reduce_expr.format(src0=src0, src1=src1)
 176    src0 = prereduce("src0.x")
 177    src1 = prereduce("src0.y")
 178    src2 = prereduce("src0.z")
 179    src3 = prereduce("src0.w")
 180    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 181               final(reduce_(src0, src1)))
 182    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 183               final(reduce_(reduce_(src0, src1), src2)))
 184    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 185               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 186
 187 def unop_numeric_convert(name, out_type, in_type, const_expr):
 188    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 189
# Basic unary opcodes.  Each const_expr is a C expression over src0; where
# an expression tests bit_size, the 64-bit case uses the double-precision
# form and every other size uses the single-precision form.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fsign yields 0.0 for a zero input, otherwise 1.0 or -1.0.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat clamps to [0.0, 1.0]; anything that is not > 1.0 and not <= 0.0 is
# passed through unchanged.
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): fexp2/flog2 always call the single-precision C functions,
# with no bit_size == 64 case -- confirm this is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 209
# Generate all of the numeric conversion opcodes
#
# Opcode names have the form "<s>2<d><bits>[<rounding>]", where <s> and <d>
# are the first letters of the source and destination base types and <bits>
# is the destination bit size (for example "f2f16_rtz").
for src_t in [tint, tuint, tfloat, tbool]:
   # Select which destination base types exist for this source base type.
   if src_t == tbool:
      dst_types = [tfloat, tint]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 comes in three flavours: explicit
              # round-to-nearest-even, explicit round-toward-zero, and an
              # unsuffixed variant that just assigns the source.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = "src0"

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32 uses an explicit round-toward-zero helper
              # only when narrowing (bit_size > 32) under an RTZ execution
              # mode; otherwise it is a plain assignment.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          else:
              # Conversions to bool test against zero; everything else is
              # a plain assignment of the source.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
 267
 268
# Unary floating-point rounding operations.
#
# Each expression picks the double-precision C function when bit_size is
# 64 and the single-precision one otherwise.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Inputs with magnitude below 2^-14 become a zero carrying the sign of the
# input; all other inputs are round-tripped through the half-float helpers.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp keeps only the exponent that C frexp() writes through its
# pointer argument; frexp_sig keeps only frexp()'s return value.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.
#
# The expressions here only describe the result for constant inputs, so
# every derivative opcode evaluates to 0.0.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 299
 300
 301 # Floating point pack and unpack operations.
 302
 303 def pack_2x16(fmt):
 304    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 305 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 306 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 307 """.replace("fmt", fmt))
 308
 309 def pack_4x8(fmt):
 310    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 311 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 312 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 313 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 314 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 315 """.replace("fmt", fmt))
 316
 317 def unpack_2x16(fmt):
 318    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 319 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 320 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 321 """.replace("fmt", fmt))
 322
 323 def unpack_4x8(fmt):
 324    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 325 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 326 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 327 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 328 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 329 """.replace("fmt", fmt))
 330
 331
# Instantiate the pack/unpack opcodes for each format.  The "half" format
# only has 2x16 variants; "snorm" and "unorm" have both 2x16 and 4x8.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
 342
# Integer packing: the low 16 bits of x are combined with y shifted into
# the upper half of a single 32-bit word.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# Four components are shifted to byte positions 0, 8, 16 and 24 and ORed
# together; the components are not masked first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Pack narrower unsigned values into one wider value, x in the lowest bits.
# Sources are widened by a cast before shifting so no bits are lost.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Split a 64-bit value into its low (x) and high (y) 32-bit halves.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
 365
 366 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 367            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 368
 369 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
 370            "dst.x = src0.x; dst.y = src0.x >> 16;")
 371
 372 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
 373 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
 374 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
 375 """)
 376
# Lowered floating point unpacking operations.
#
# Each "split" opcode produces just one half of the packed source: the _x
# variants read the low half and the _y variants read the high half.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: these expressions contain no mask or cast of their own.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 394
# Bit operations, part of ARB_gpu_shader5.


# Mirror the 32 bits of the source: bit i moves to bit (31 - i).
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Count the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the most significant set bit, or -1 when the source is zero.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant over 32 bits: for non-negative sources, the index of the
# highest 1 bit; for negative sources, the index of the highest 0 bit.  The
# result is -1 when no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when the source is zero.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 445
 446
# Register fnoise<i>_<j> for every output size i and input size j in 1..4.
# All sixteen variants use the constant expression 0.0f.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 450
 451
# AMD_gcn_shader extended instructions
#
# cube_face_coord: "ma" is twice the component with the largest absolute
# value.  The six "if" lines select a pair of coordinates according to
# which axis is largest and its sign, and the pair is then divided by "ma"
# and offset by 0.5.  When several axes tie, more than one "if" fires and
# the last matching assignment wins.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# cube_face_index: 0/1 for +X/-X, 2/3 for +Y/-Y, 4/5 for +Z/-Z, chosen by
# the axis with the largest absolute value.  On ties the last matching
# assignment wins.
# NOTE(review): dst.x is not given a value before the "if" chain, so it is
# only assigned when at least one condition matches -- confirm this is
# acceptable for inputs where every comparison is false.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
 486
# Sum of vector components
#
# Registers fsum2, fsum3 and fsum4.  Components are used as-is ("{src}"),
# combined with "+", and the sum is returned without a final transform.
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 489
 490 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 491    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 492           False, alg_props, const_expr)
 493
 494 def binop(name, ty, alg_props, const_expr):
 495    binop_convert(name, ty, ty, alg_props, const_expr)
 496
 497 def binop_compare(name, ty, alg_props, const_expr):
 498    binop_convert(name, tbool1, ty, alg_props, const_expr)
 499
 500 def binop_compare8(name, ty, alg_props, const_expr):
 501    binop_convert(name, tbool8, ty, alg_props, const_expr)
 502
 503 def binop_compare16(name, ty, alg_props, const_expr):
 504    binop_convert(name, tbool16, ty, alg_props, const_expr)
 505
 506 def binop_compare32(name, ty, alg_props, const_expr):
 507    binop_convert(name, tbool32, ty, alg_props, const_expr)
 508
 509 def binop_compare_all_sizes(name, ty, alg_props, const_expr):
 510    binop_compare(name, ty, alg_props, const_expr)
 511    binop_compare8(name + "8", ty, alg_props, const_expr)
 512    binop_compare16(name + "16", ty, alg_props, const_expr)
 513    binop_compare32(name + "32", ty, alg_props, const_expr)
 514
 515 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 516                 src2_type, const_expr):
 517    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 518           False, "", const_expr)
 519
 520 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 521                  reduce_expr, final_expr):
 522    def final(src):
 523       return final_expr.format(src= "(" + src + ")")
 524    def reduce_(src0, src1):
 525       return reduce_expr.format(src0=src0, src1=src1)
 526    def prereduce(src0, src1):
 527       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 528    src0 = prereduce("src0.x", "src1.x")
 529    src1 = prereduce("src0.y", "src1.y")
 530    src2 = prereduce("src0.z", "src1.z")
 531    src3 = prereduce("src0.w", "src1.w")
 532    opcode(name + "2", output_size, output_type,
 533           [2, 2], [src_type, src_type], False, _2src_commutative,
 534           final(reduce_(src0, src1)))
 535    opcode(name + "3", output_size, output_type,
 536           [3, 3], [src_type, src_type], False, _2src_commutative,
 537           final(reduce_(reduce_(src0, src1), src2)))
 538    opcode(name + "4", output_size, output_type,
 539           [4, 4], [src_type, src_type], False, _2src_commutative,
 540           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 541
 542 def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
 543                            reduce_expr, final_expr):
 544    binop_reduce(name, output_size, tbool1, src_type,
 545                 prereduce_expr, reduce_expr, final_expr)
 546    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
 547                 prereduce_expr, reduce_expr, final_expr)
 548    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
 549                 prereduce_expr, reduce_expr, final_expr)
 550    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
 551                 prereduce_expr, reduce_expr, final_expr)
 552
# Floating-point add.  Under a round-toward-zero execution mode the 64-bit
# case uses a dedicated RTZ helper, and narrower sizes compute in double
# and then narrow with RTZ; otherwise it is a plain C addition.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Signed saturating add: on overflow the result is
# (1 << (bit_size - 1)) - 1 when src1 > 0 and 1 << (bit_size - 1) otherwise.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Unsigned saturating add: a sum smaller than src0 means it wrapped.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Signed saturating subtract, mirroring iadd_sat with the sign test on src1
# reversed.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Unsigned saturating subtract: the result is 0 instead of wrapping.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Floating-point subtract; same rounding-mode handling as fadd.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")

# Floating-point multiply; same rounding-mode handling as fadd.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
# Both sources are widened to 64 bits before multiplying so the product
# cannot overflow.
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
# For sizes below 64 the product is formed in 64 bits and shifted right by
# bit_size.  The 64-bit case splits each sign-extended source into four
# 32-bit words and takes words 2 and 3 of the product.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
# Same structure as imul_high, but the 64-bit case needs only two 32-bit
# words per source because there is no sign extension.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# low 32-bits of unsigned integer multiply
# Each source is masked to its low bit_size / 2 bits before multiplying.
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")
 655
 656
# Division.  The integer forms return 0 for a zero divisor instead of
# performing the C division.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
# A wrapped sum is smaller than src0, so the comparison detects the carry.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 683
 684 # rhadd: (a + b + 1) >> 1 (without overflow)
 685 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 686 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 687 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 688 #           =     ((x | y) << 1) - (x ^ y) + 1
 689 #
 690 # Since we know that the bottom bit of (x & y) << 1 is zero,
 691 #
 692 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 693 #                  = (x | y) -  ((x ^ y)      >> 1)
 694 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 695 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 696
# Unsigned modulus; a zero divisor yields 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# operands differ in sign.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient down (floor) and frem rounds it toward zero
# (trunc).
# NOTE(review): both always call the single-precision floorf/truncf, with
# no bit_size == 64 case -- confirm this is intended.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 713
 714 #
 715 # Comparisons
 716 #
 717
 718
 719 # these integer-aware comparisons return a boolean (0 or ~0)
 720
 721 binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
 722 binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
 723 binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
 724 binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
 725 binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
 726 binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
 727 binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
 728 binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
 729 binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
 730 binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
 731
# integer-aware GLSL-style comparisons that compare floats and ints
#
# The "ball_*" opcodes combine the per-component "==" results with "&&";
# the "bany_*" opcodes combine the per-component "!=" results with "||".
# NOTE(review): the three template strings look like (per-component
# expression, expression combining two partial results, final expression
# applied to the reduced value "{src}") -- confirm against the definition of
# binop_reduce.

binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
#
# Same comparisons as above, but the final expression turns the reduced
# value into the float 1.0f or 0.0f.

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 749
# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively.
#
# NOTE(review): "sge" is declared with tfloat while its three siblings use
# tfloat32 -- confirm that the difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 757
# SPIR-V shifts are undefined for shift operands >= the bit size, but SM5
# shifts are defined to use only the least significant bits of the shift
# operand.  The NIR definition follows the SM5 specification: the shift
# count (src1, always 32-bit unsigned) is masked with the bit width of src0
# minus one before shifting.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Rotate left / rotate right.  The rotate count and its negation are both
# masked with the bit width of src0 minus one, so neither of the two partial
# shifts ever shifts by the full width (a count of 0 shifts by 0 both ways).
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")
 778
# Bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.  All three are declared commutative and associative.


binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 788
 789
# Dot product: per-component products combined by addition.  fdot has a
# 1-component result; fdot_replicated declares a 4-component result.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Homogeneous dot product: a 3-component src0 against a 4-component src1,
# i.e. dot(src0.xyz, src1.xyz) + src1.w.  The _replicated variant declares a
# 4-component result with the same expression.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 800
# Two-source minimum / maximum.  The float variants defer to the C library
# fmin()/fmax() and carry no algebraic properties; the integer variants are
# plain comparisons and are declared commutative and associative.
binop("fmin", tfloat, "", "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 807
# The *_4x8 opcodes below treat each 32-bit source as four packed 8-bit
# channels.  Every loop walks the channels at bit offsets 0, 8, 16 and 24,
# isolates them with "& 0xff" and ORs the per-channel result back into dst.

# Saturated vector add for 4 8-bit ints: each channel sum is clamped to 0xff.
binop("usadd_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8-bit ints: a channel stays 0 unless the
# src0 channel is strictly greater than the src1 channel.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# Vector min for 4 8-bit ints.
binop("umin_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# Vector max for 4 8-bit ints.
binop("umax_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255, per channel, using integer division.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 852
 853 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 854
# Packs two 32-bit floats into one 32-bit word: pack_half_1x16(src0.x) ends
# up in the low 16 bits and pack_half_1x16(src1.x) in the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Builds a 64-bit value from two 32-bit halves: src0 is the low half and
# src1 the high half.  src1 is widened before the shift so no bits are lost.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# Builds a 32-bit value from two 16-bit halves: src0 is the low half and
# src1 the high half.
binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
 863
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, the bits and offset values
# are from the low five bits of src0 and src1, respectively.  The result is a
# mask of "bits" one-bits shifted left by "offset".
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""")
 872
# ldexp: evaluated with the C library ldexp() for 64-bit operands and
# ldexpf() otherwise; the exponent (src1) is always a 32-bit signed integer.
# Any result for which isnormal() is false is replaced by a zero that carries
# the sign of src0.
# NOTE(review): isnormal() is also false for infinities and NaNs, so those
# results appear to be replaced by zero as well -- confirm this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
 879
# Combines the first component of each input to make a 2-component vector:
# dst.x comes from src0.x and dst.y from src1.x.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")
 886
# Byte extraction: shift src0 right by src1 bytes and keep the low 8 bits.
# The cast type decides whether the byte is read as unsigned (uint8_t) or
# signed (int8_t).
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: same as above with 16-bit units (uint16_t / int16_t).
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 894
 895
 896 def triop(name, ty, alg_props, const_expr):
 897    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 898 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 899    opcode(name, output_size, tuint,
 900    [src1_size, src2_size, src3_size],
 901    [tuint, tuint, tuint], False, "", const_expr)
 902
# Fused multiply-add: src0 * src1 + src2.
#
# When nir_is_rounding_mode_rtz() reports round-toward-zero for this bit
# size, the _mesa_*_fma_rtz helpers are used: the double helper for 64-bit,
# the float helper for 32-bit, and for any other size the double helper
# followed by _mesa_double_to_float_rtz().  Otherwise the C library fmaf()
# (32-bit) or fma() (every other size) is used.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")
 918
# Linear interpolation between src0 and src1 with weight src2:
# src2 == 0 gives src0 and src2 == 1 gives src1.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
#
# fcsel is the floating-point version: src1 is selected when src0 != 0.0f,
# otherwise src2.


triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 929
 930 # 3 way min/max/med
 931 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 932 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 933 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 934
 935 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 936 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 937 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 938
 939 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 940 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 941 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 942
 943 opcode("bcsel", 0, tuint, [0, 0, 0],
 944        [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
 945 opcode("b8csel", 0, tuint, [0, 0, 0],
 946        [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
 947 opcode("b16csel", 0, tuint, [0, 0, 0],
 948        [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
 949 opcode("b32csel", 0, tuint, [0, 0, 0],
 950        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 951
# SM5 bfi assembly
#
# src0 is the mask, src1 the value to insert and src2 the base.  A zero mask
# returns the base unchanged.  Otherwise the insert value is shifted left by
# the number of trailing zero bits of the mask, and the masked bits of the
# base are replaced with the corresponding bits of the shifted insert value.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")


# Per-bit select: bits set in src0 come from src1, all other bits from src2.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 969
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
#
# src0 is the base, src1 the offset and src2 the bit count.  A bit count of 0
# yields 0.  When offset + bits is below 32 the field is moved to the top of
# the word and shifted back down by (32 - bits); otherwise the result is just
# the base shifted right by offset.  ubfe works on an unsigned base.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
# ibfe follows the same steps on a signed base, so its right shifts operate
# on a signed int.
# NOTE(review): this presumably relies on signed right shift being
# arithmetic (sign-extending) -- confirm for the supported compilers.
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
 997
# GLSL bitfieldExtract(), unsigned variant.
#
# src0 is the base, src1 the offset and src2 the bit count.  A bit count of 0
# yields 0.  A negative offset or bit count, or offset + bits above 32, also
# yields 0 (the C comment marks that case as undefined per the spec).
# Otherwise the base is shifted right by offset and masked to "bits" bits; the
# mask is built from a 64-bit one so that bits == 32 does not overflow.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# GLSL bitfieldExtract(), signed variant.
#
# src0 is the base, src1 the offset and src2 the bit count.  A bit count of 0
# yields 0, as does a negative or out-of-range offset/bit count.  Otherwise
# the field is first shifted up so that its top bit lands in bit 31, and then
# shifted back down by (32 - bits) so that its lowest bit lands in bit 0 and
# the field's top bit is sign-extended.  The down-shift must be (32 - bits),
# as in ibfe: shifting down by "offset" only gives the right answer when
# offset + bits == 32.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1023
# Combines the first component of each input to make a 3-component vector:
# dst.x, dst.y and dst.z come from src0.x, src1.x and src2.x respectively.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1031
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit component counts.

   The output and all four sources use type tuint; the opcode is registered
   through opcode() with is_conversion set to False and no algebraic
   properties.
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   src_types = [tuint] * 4
   opcode(name, output_size, tuint, src_sizes, src_types, False, "",
          const_expr)
1038
# Bitfield insert: src0 is the base, src1 the value to insert, src2 the
# offset and src3 the bit count.  A bit count of 0 returns the base
# unchanged.  A negative offset or bit count, or bits + offset above 32,
# yields 0.  Otherwise "bits" bits of the base starting at "offset" are
# replaced with the low bits of the insert value; the mask is built from a
# 64-bit one so that bits == 32 does not overflow.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1052
# Combines the first component of each input to make a 4-component vector:
# dst.x, dst.y, dst.z and dst.w come from src0.x, src1.x, src2.x and src3.x
# respectively.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
1059
# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower-precision integer multiplies can be used.
# This is useful on hardware that has 24-bit or perhaps 16-bit integer
# multiply instructions.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")
1071
# ir3-specific instruction that maps directly to mul-add shift high mix
# (IMADSH_MIX16, i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
#
# The high 16 bits of src0 are multiplied by the low 16 bits of src1, the
# product is shifted left by 16 and src2 is added.
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")
1079
# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int.
# Each multiplicand is shifted left and then right by 8 so that only its low
# 24 bits take part in the multiply.
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension); same operand handling
# as above without the addend.
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")