src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16)
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
# helper variables for strings
#
# Each constant holds a NIR type name as a string.  Names without a trailing
# number carry no bit size; the others name one specific bit size (see
# type_has_size and type_size).
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
 110
 111 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 112
 113 def type_has_size(type_):
 114     m = _TYPE_SPLIT_RE.match(type_)
 115     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 116     return m.group('bits') is not None
 117
 118 def type_size(type_):
 119     m = _TYPE_SPLIT_RE.match(type_)
 120     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 121     assert m.group('bits') is not None, \
 122            'NIR type string has no bit size: "{}"'.format(type_)
 123     return int(m.group('bits'))
 124
 125 def type_sizes(type_):
 126     if type_has_size(type_):
 127         return [type_size(type_)]
 128     elif type_ == 'bool':
 129         return [1, 8, 16, 32]
 130     elif type_ == 'float':
 131         return [16, 32, 64]
 132     else:
 133         return [1, 8, 16, 32, 64]
 134
 135 def type_base_type(type_):
 136     m = _TYPE_SPLIT_RE.match(type_)
 137     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 138     return m.group('type')
 139
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
_2src_commutative = "2src_commutative "
# Both property strings end in a space so that they can be joined with "+"
# into the space-separated algebraic_properties string that Opcode expects.
associative = "associative "

# global dictionary of opcodes
# Maps each opcode name to its Opcode object; filled in by opcode().
opcodes = {}
 150
 151 def opcode(name, output_size, output_type, input_sizes, input_types,
 152            is_conversion, algebraic_properties, const_expr):
 153    assert name not in opcodes
 154    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 155                           input_types, is_conversion, algebraic_properties,
 156                           const_expr)
 157
 158 def unop_convert(name, out_type, in_type, const_expr):
 159    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 160
 161 def unop(name, ty, const_expr):
 162    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 163
 164 def unop_horiz(name, output_size, output_type, input_size, input_type,
 165                const_expr):
 166    opcode(name, output_size, output_type, [input_size], [input_type],
 167           False, "", const_expr)
 168
 169 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 170                 reduce_expr, final_expr):
 171    def prereduce(src):
 172       return "(" + prereduce_expr.format(src=src) + ")"
 173    def final(src):
 174       return final_expr.format(src="(" + src + ")")
 175    def reduce_(src0, src1):
 176       return reduce_expr.format(src0=src0, src1=src1)
 177    src0 = prereduce("src0.x")
 178    src1 = prereduce("src0.y")
 179    src2 = prereduce("src0.z")
 180    src3 = prereduce("src0.w")
 181    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 182               final(reduce_(src0, src1)))
 183    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 184               final(reduce_(reduce_(src0, src1), src2)))
 185    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 186               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 187
 188 def unop_numeric_convert(name, out_type, in_type, const_expr):
 189    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 190
# Basic unary opcodes.  Constant expressions that contain "bit_size == 64 ?"
# use a double-precision form for 64-bit values and a float form otherwise.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fsign and isign produce 0 for a zero source, 1 for a positive source and
# -1 otherwise.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# Clamps: fsat to [0, 1], fsat_signed to [-1, 1], fclamp_pos to >= 0.
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("fsat_signed", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos", tfloat, ("fmax(src0, 0.0)"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 210
# Generate all of the numeric conversion opcodes
#
# Opcode names have the form <src letter>2<dst letter><dst bit size>, using
# the first character of the source and destination type names (for example
# "f2f16" or "i2b1").  The source type is unsized; the destination type always
# names one bit size.
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination base types that are generated for each source base type.
   if src_t == tbool:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 comes in three flavours: explicit
              # round-to-nearest-even (_rtne), explicit round-toward-zero
              # (_rtz) and one with no rounding-mode suffix.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = "src0"

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32 goes through an explicit round-toward-zero
              # helper when the source is wider than 32 bits and the execution
              # mode asks for RTZ at 32 bits.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          else:
              # Conversions to bool test the source against zero; everything
              # else is a plain assignment of the source.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
 268
# Special opcode that is the same as f2f16 except that it is safe to remove it
# if the result is immediately converted back to float32 again. This is
# generated as part of the precision lowering pass. mp stands for medium
# precision.
# It reuses the constant expression already registered for f2f16, so it must
# come after the conversion opcodes have been generated.
unop_numeric_convert("f2fmp", tfloat16, tfloat, opcodes["f2f16"].const_expr)

# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values with magnitude below 2^-14 become a zero with the sign of the source;
# all other values are converted to half precision and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp stores the exponent that C frexp() writes through its pointer
# argument; frexp_sig stores the value frexp() returns and discards the
# exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 305
 306
 307 # Floating point pack and unpack operations.
 308
 309 def pack_2x16(fmt):
 310    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 311 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 312 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 313 """.replace("fmt", fmt))
 314
 315 def pack_4x8(fmt):
 316    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 317 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 318 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 319 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 320 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 321 """.replace("fmt", fmt))
 322
 323 def unpack_2x16(fmt):
 324    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 325 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 326 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 327 """.replace("fmt", fmt))
 328
 329 def unpack_4x8(fmt):
 330    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 331 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 332 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 333 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 334 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 335 """.replace("fmt", fmt))
 336
 337
# Instantiate the float pack/unpack opcodes for each format.  There is no
# 4x8 variant for "half".
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Integer packing: the low 16 bits of x go in the low half, y in the high half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# Integer packing: each component is shifted to its byte position and the
# results are ORed together; the components are not masked first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Bit-exact packing of narrower unsigned components into one wider value,
# with component x in the least significant bits.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Inverse of pack_64_2x32: x receives the low 32 bits, y the high 32 bits.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
 371
 372 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 373            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 374
# Inverse of pack_32_2x16: x receives the low 16 bits, y the high 16 bits.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")
 377
 378 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
 379 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
 380 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
 381 """)
 382
# Lowered floating point unpacking operations.
#
# Each *_split_x opcode reads the low half of its source and each *_split_y
# opcode reads the high half, producing one result instead of a vector.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 400
# Bit operations, part of ARB_gpu_shader5.


# Bit i of the source becomes bit (31 - i) of the result.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Number of set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the most significant set bit, or -1 when no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Count of leading zeros: 31 minus the index of the most significant set bit.
# When no bit is set the loop ends with bit == -1, so the result is 32.
unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(31 - bit);
""")

# Signed variant of find-msb; yields -1 when no qualifying bit is found.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 460
# AMD_gcn_shader extended instructions
#
# cube_face_coord: "ma" is twice the source component with the largest
# absolute value.  The two remaining components are selected (with sign
# adjustments that depend on the major axis and its sign), divided by ma and
# offset by 0.5.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# cube_face_index: the result is 0/1 for a positive/negative X major axis,
# 2/3 for Y and 4/5 for Z.  The tests run in order, so when several axes tie
# for the largest magnitude the last matching line wins.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
# Registers fsum2, fsum3 and fsum4.
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 498
 499 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 500    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 501           False, alg_props, const_expr)
 502
 503 def binop(name, ty, alg_props, const_expr):
 504    binop_convert(name, ty, ty, alg_props, const_expr)
 505
 506 def binop_compare(name, ty, alg_props, const_expr):
 507    binop_convert(name, tbool1, ty, alg_props, const_expr)
 508
 509 def binop_compare8(name, ty, alg_props, const_expr):
 510    binop_convert(name, tbool8, ty, alg_props, const_expr)
 511
 512 def binop_compare16(name, ty, alg_props, const_expr):
 513    binop_convert(name, tbool16, ty, alg_props, const_expr)
 514
 515 def binop_compare32(name, ty, alg_props, const_expr):
 516    binop_convert(name, tbool32, ty, alg_props, const_expr)
 517
 518 def binop_compare_all_sizes(name, ty, alg_props, const_expr):
 519    binop_compare(name, ty, alg_props, const_expr)
 520    binop_compare8(name + "8", ty, alg_props, const_expr)
 521    binop_compare16(name + "16", ty, alg_props, const_expr)
 522    binop_compare32(name + "32", ty, alg_props, const_expr)
 523
 524 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 525                 src2_type, const_expr):
 526    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 527           False, "", const_expr)
 528
 529 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 530                  reduce_expr, final_expr):
 531    def final(src):
 532       return final_expr.format(src= "(" + src + ")")
 533    def reduce_(src0, src1):
 534       return reduce_expr.format(src0=src0, src1=src1)
 535    def prereduce(src0, src1):
 536       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 537    src0 = prereduce("src0.x", "src1.x")
 538    src1 = prereduce("src0.y", "src1.y")
 539    src2 = prereduce("src0.z", "src1.z")
 540    src3 = prereduce("src0.w", "src1.w")
 541    opcode(name + "2", output_size, output_type,
 542           [2, 2], [src_type, src_type], False, _2src_commutative,
 543           final(reduce_(src0, src1)))
 544    opcode(name + "3", output_size, output_type,
 545           [3, 3], [src_type, src_type], False, _2src_commutative,
 546           final(reduce_(reduce_(src0, src1), src2)))
 547    opcode(name + "4", output_size, output_type,
 548           [4, 4], [src_type, src_type], False, _2src_commutative,
 549           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 550
 551 def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
 552                            reduce_expr, final_expr):
 553    binop_reduce(name, output_size, tbool1, src_type,
 554                 prereduce_expr, reduce_expr, final_expr)
 555    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
 556                 prereduce_expr, reduce_expr, final_expr)
 557    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
 558                 prereduce_expr, reduce_expr, final_expr)
 559    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
 560                 prereduce_expr, reduce_expr, final_expr)
 561
# Floating-point add.  When the execution mode requests round-toward-zero for
# this bit size, the sum goes through an explicit RTZ helper: directly for
# 64-bit, otherwise by adding as doubles and converting to float with RTZ.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add: on overflow the result is the bit pattern
# (1 << (bit_size - 1)) - 1 for a positive src1 and 1 << (bit_size - 1)
# otherwise.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Saturating unsigned add: a wrapped sum is smaller than src0.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract; mirrors iadd_sat with the sign test on src1
# reversed.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Saturating unsigned subtract: the result is 0 when src1 exceeds src0.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Floating-point subtract; same round-toward-zero handling as fadd.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
# Absolute difference of two signed values, produced as an unsigned result.
# The operands are cast to uint64_t before the subtraction.
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
# Absolute difference of two unsigned values.
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
 603
# Floating-point multiply.  When the execution mode requests round-toward-zero
# for this bit size, the product goes through an explicit RTZ helper: directly
# for 64-bit, otherwise by multiplying as doubles and converting to float with
# RTZ.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
# For sizes below 64 bits the product is computed in 64 bits and shifted right
# by bit_size; 64-bit sources use a multi-word multiply on 32-bit limbs.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# low 32-bits of unsigned integer multiply
# The expression masks both sources to their low bit_size / 2 bits before
# multiplying.
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")

# Multiply 32-bits with low 16-bits.
binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")

binop("fdiv", tfloat, "", "src0 / src1")
# Integer division by zero produces 0 in the constant expression.
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
 677
# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
# The opcode's output type is uint; a wrapped sum is smaller than src0.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
# The opcode's output type is uint.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 700
 701 # rhadd: (a + b + 1) >> 1 (without overflow)
 702 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 703 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 704 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 705 #           =     ((x | y) << 1) - (x ^ y) + 1
 706 #
 707 # Since we know that the bottom bit of (x & y) << 1 is zero,
 708 #
 709 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 710 #                  = (x | y) -  ((x ^ y)      >> 1)
 711 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 712 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 713
# Unsigned modulus; a zero divisor produces 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# operands are not both non-negative or both negative.  A zero divisor
# produces 0 for both irem and imod.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient down (floorf); frem rounds it toward zero (truncf).
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 730
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
# Each call registers four opcodes: the plain name with a bool1 result and
# the name suffixed with 8, 16 and 32 for the wider boolean results.

binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
# Each call registers 2-, 3- and 4-component variants for every boolean
# size: "all equal" ANDs the per-component results and "any not equal" ORs
# them.

binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
# NOTE(review): sge is declared on the unsized float type while slt, seq and
# sne are declared on float32 - confirm that this difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 774
 775 # SPIRV shifts are undefined for shift-operands >= bitsize,
 776 # but SM5 shifts are defined to use the least significant bits, only
 777 # The NIR definition is according to the SM5 specification.
 778 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
 779        "src0 << (src1 & (sizeof(src0) * 8 - 1))")
 780 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
 781        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 782 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
 783        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 784
 785 opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 786    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 787    dst = (src0 << (src1 & rotate_mask)) |
 788          (src0 >> (-src1 & rotate_mask));
 789 """)
 790 opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 791    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 792    dst = (src0 >> (src1 & rotate_mask)) |
 793          (src0 << (-src1 & rotate_mask));
 794 """)
 795
 796 # bitwise logic operators
 797 #
 798 # These are also used as boolean and, or, xor for hardware supporting
 799 # integers.
 800
 801
 802 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
 803 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
 804 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 805
 806
 807 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 808              "{src}")
 809
 810 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 811              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 812
 813 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
 814        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 815 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
 816        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 817
 818 binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
 819 binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 820 binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 821 binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
 822 binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 823 binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 824
 825 # Saturated vector add for 4 8bit ints.
 826 binop("usadd_4x8", tint32, _2src_commutative + associative, """
 827 dst = 0;
 828 for (int i = 0; i < 32; i += 8) {
 829    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 830 }
 831 """)
 832
 833 # Saturated vector subtract for 4 8bit ints.
 834 binop("ussub_4x8", tint32, "", """
 835 dst = 0;
 836 for (int i = 0; i < 32; i += 8) {
 837    int src0_chan = (src0 >> i) & 0xff;
 838    int src1_chan = (src1 >> i) & 0xff;
 839    if (src0_chan > src1_chan)
 840       dst |= (src0_chan - src1_chan) << i;
 841 }
 842 """)
 843
 844 # vector min for 4 8bit ints.
 845 binop("umin_4x8", tint32, _2src_commutative + associative, """
 846 dst = 0;
 847 for (int i = 0; i < 32; i += 8) {
 848    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 849 }
 850 """)
 851
 852 # vector max for 4 8bit ints.
 853 binop("umax_4x8", tint32, _2src_commutative + associative, """
 854 dst = 0;
 855 for (int i = 0; i < 32; i += 8) {
 856    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 857 }
 858 """)
 859
 860 # unorm multiply: (a * b) / 255.
 861 binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
 862 dst = 0;
 863 for (int i = 0; i < 32; i += 8) {
 864    int src0_chan = (src0 >> i) & 0xff;
 865    int src1_chan = (src1 >> i) & 0xff;
 866    dst |= ((src0_chan * src1_chan) / 255) << i;
 867 }
 868 """)
 869
 870 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 871
 872 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 873             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 874
 875 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 876               "src0 | ((uint64_t)src1 << 32)")
 877
 878 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
 879               "src0 | ((uint32_t)src1 << 16)")
 880
 881 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 882 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
 883 # are from the low five bits of src0 and src1, respectively.
 884 binop_convert("bfm", tuint32, tint32, "", """
 885 int bits = src0 & 0x1F;
 886 int offset = src1 & 0x1F;
 887 dst = ((1u << bits) - 1) << offset;
 888 """)
 889
 890 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
 891 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 892 /* flush denormals to zero. */
 893 if (!isnormal(dst))
 894    dst = copysignf(0.0f, src0);
 895 """)
 896
 897 # Combines the first component of each input to make a 2-component vector.
 898
 899 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 900 dst.x = src0.x;
 901 dst.y = src1.x;
 902 """)
 903
 904 # Byte extraction
 905 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 906 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 907
 908 # Word extraction
 909 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 910 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 911
 912
 913 def triop(name, ty, alg_props, const_expr):
 914    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 915 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 916    opcode(name, output_size, tuint,
 917    [src1_size, src2_size, src3_size],
 918    [tuint, tuint, tuint], False, "", const_expr)
 919
 920 triop("ffma", tfloat, _2src_commutative, """
 921 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 922    if (bit_size == 64)
 923       dst = _mesa_double_fma_rtz(src0, src1, src2);
 924    else if (bit_size == 32)
 925       dst = _mesa_float_fma_rtz(src0, src1, src2);
 926    else
 927       dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
 928 } else {
 929    if (bit_size == 32)
 930       dst = fmaf(src0, src1, src2);
 931    else
 932       dst = fma(src0, src1, src2);
 933 }
 934 """)
 935
 936 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 937
 938 # Conditional Select
 939 #
 940 # A vector conditional select instruction (like ?:, but operating per-
 941 # component on vectors). There are two versions, one for floating point
 942 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 943
 944
 945 triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 946
 947 # 3 way min/max/med
 948 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 949 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 950 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 951
 952 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 953 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 954 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 955
 956 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 957 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 958 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 959
 960 opcode("bcsel", 0, tuint, [0, 0, 0],
 961        [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
 962 opcode("b8csel", 0, tuint, [0, 0, 0],
 963        [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
 964 opcode("b16csel", 0, tuint, [0, 0, 0],
 965        [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
 966 opcode("b32csel", 0, tuint, [0, 0, 0],
 967        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 968
 969 # SM5 bfi assembly
 970 triop("bfi", tuint32, "", """
 971 unsigned mask = src0, insert = src1, base = src2;
 972 if (mask == 0) {
 973    dst = base;
 974 } else {
 975    unsigned tmp = mask;
 976    while (!(tmp & 1)) {
 977       tmp >>= 1;
 978       insert <<= 1;
 979    }
 980    dst = (base & ~mask) | (insert & mask);
 981 }
 982 """)
 983
 984
 985 triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 986
 987 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
 988 opcode("ubfe", 0, tuint32,
 989        [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
 990 unsigned base = src0;
 991 unsigned offset = src1 & 0x1F;
 992 unsigned bits = src2 & 0x1F;
 993 if (bits == 0) {
 994    dst = 0;
 995 } else if (offset + bits < 32) {
 996    dst = (base << (32 - bits - offset)) >> (32 - bits);
 997 } else {
 998    dst = base >> offset;
 999 }
1000 """)
1001 opcode("ibfe", 0, tint32,
1002        [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
1003 int base = src0;
1004 unsigned offset = src1 & 0x1F;
1005 unsigned bits = src2 & 0x1F;
1006 if (bits == 0) {
1007    dst = 0;
1008 } else if (offset + bits < 32) {
1009    dst = (base << (32 - bits - offset)) >> (32 - bits);
1010 } else {
1011    dst = base >> offset;
1012 }
1013 """)
1014
# GLSL bitfieldExtract()
#
# Unsigned variant.  A zero width, a negative offset or width, or a field
# reaching past bit 32 all fold to 0.  Otherwise the field is shifted down
# and masked; the mask is built in 64 bits so that bits == 32 works.
opcode(
   "ubitfield_extract", 0, tuint32,
   [0, 0, 0], [tuint32, tint32, tint32],
   False, "",
   """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed GLSL bitfieldExtract(): extract `bits` bits of src0 starting at
# `offset` and sign-extend the result.  A zero width, a negative offset or
# width, or a field reaching past bit 32 all fold to 0.
#
# The field is first shifted so that its top bit lands in bit 31, then
# arithmetically shifted right by (32 - bits) so that its lowest bit lands
# in bit 0.  The second shift previously used `offset`, which only gave the
# right answer when offset == 32 - bits (e.g. base = 0xF0, offset = 4,
# bits = 4 produced 0xFF000000 instead of -1).  This now matches ibfe.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1040
# Combines the first component of each input to make a 3-component vector.
triop_horiz(
   "vec3", 3, 1, 1, 1,
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1048
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit sizes.

   output_size and the four source sizes are handed to opcode() as given.
   The result and every source are typed tuint, the conversion flag is
   False and no algebraic properties are set.
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   src_types = [tuint] * 4
   opcode(name, output_size, tuint, src_sizes, src_types, False, "",
          const_expr)
1055
# Insert the low `bits` bits of src1 into src0 at `offset`.  A zero width
# returns the base unchanged; a negative offset or width, or a field
# reaching past bit 32, folds to 0.  The mask is built in 64 bits so that
# bits == 32 works.
opcode(
   "bitfield_insert", 0, tuint32,
   [0, 0, 0, 0], [tuint32, tuint32, tint32, tint32],
   False, "",
   """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1069
# Combines the first component of each input to make a 4-component vector.
quadop_horiz(
   "vec4", 4, 1, 1, 1, 1,
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
1076
# Combines the first component of each of 8 inputs into an 8-component
# vector.  The constant expression is one "dst.<c> = src<i>.x;" line per
# component, generated from the component letters in order.
opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "",
       "\n" + "".join("dst.%s = src%d.x;\n" % (_chan, _idx)
                      for _idx, _chan in enumerate("xyzwefgh")))
1089
# Combines the first component of each of 16 inputs into a 16-component
# vector.  The constant expression is one "dst.<c> = src<i>.x;" line per
# component, generated from the component letters in order.
opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "",
       "\n" + "".join("dst.%s = src%d.x;\n" % (_chan, _idx)
                      for _idx, _chan in enumerate("xyzwefghijklmnop")))
1110
# An integer multiply instruction for address calculation.  It behaves like
# imul, except that the result is undefined on overflow, where overflow is
# judged against the size of the variable being dereferenced.
#
# Because the definition is looser than imul's, an optimization pass may
# propagate bounds (i.e. from a load/store intrinsic) to the sources and
# switch to a lower precision integer multiply.  That helps on hardware
# with 24b or perhaps 16b integer multiply instructions.
binop("amul", tint, _2src_commutative + associative,
      "src0 * src1")
1122
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
#
# The high 16 bits of src0 are multiplied by the low 16 bits of src1, the
# product is shifted left by 16 and src2 is added.
opcode(
   "imadsh_mix16", 0, tint32,
   [0, 0, 0], [tint32, tint32, tint32],
   False, "",
   """
dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
""")
1130
# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int.  The
# "<< 8 then >> 8" pair on int32_t sign-extends the low 24 bits of each
# multiplicand.
triop(
   "imad24_ir3", tint32, _2src_commutative,
   "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension)
binop(
   "imul24", tint32, _2src_commutative + associative,
   "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")
1140
# unsigned 24b multiply into 32b result plus 32b int.  The "<< 8 then >> 8"
# pair on uint32_t clears the top 8 bits of each multiplicand.
triop(
   "umad24", tuint32, _2src_commutative,
   "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")
1144
# unsigned 24b multiply into 32b result uint.  The "<< 8 then >> 8" pair on
# uint32_t clears the top 8 bits of each multiplicand.
#
# Declared with tuint32: the opcode is unsigned by name, by its constant
# expression and by analogy with umad24; it was previously registered with
# the signed tint32 type.
binop("umul24", tuint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")