src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
# helper variables for strings
#
# Each name holds a NIR type string.  The unsized names ("float", "int",
# "uint", "bool") carry no bit width; the sized variants append one.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Splits a type string into a base type ("type" group) and an optional run of
# trailing digits ("bits" group).
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 109
 110 def type_has_size(type_):
 111     m = _TYPE_SPLIT_RE.match(type_)
 112     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 113     return m.group('bits') is not None
 114
 115 def type_size(type_):
 116     m = _TYPE_SPLIT_RE.match(type_)
 117     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 118     assert m.group('bits') is not None, \
 119            'NIR type string has no bit size: "{}"'.format(type_)
 120     return int(m.group('bits'))
 121
 122 def type_sizes(type_):
 123     if type_has_size(type_):
 124         return [type_size(type_)]
 125     elif type_ == 'bool':
 126         return [1, 32]
 127     elif type_ == 'float':
 128         return [16, 32, 64]
 129     else:
 130         return [1, 8, 16, 32, 64]
 131
 132 def type_base_type(type_):
 133     m = _TYPE_SPLIT_RE.match(type_)
 134     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 135     return m.group('type')
 136
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
#
# Both property strings end in a space so that they can be concatenated
# with "+" into one space-separated property string.
_2src_commutative = "2src_commutative "
associative = "associative "

# global dictionary of opcodes, keyed by opcode name
opcodes = {}
 147
 148 def opcode(name, output_size, output_type, input_sizes, input_types,
 149            is_conversion, algebraic_properties, const_expr):
 150    assert name not in opcodes
 151    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 152                           input_types, is_conversion, algebraic_properties,
 153                           const_expr)
 154
 155 def unop_convert(name, out_type, in_type, const_expr):
 156    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 157
 158 def unop(name, ty, const_expr):
 159    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 160
 161 def unop_horiz(name, output_size, output_type, input_size, input_type,
 162                const_expr):
 163    opcode(name, output_size, output_type, [input_size], [input_type],
 164           False, "", const_expr)
 165
 166 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 167                 reduce_expr, final_expr):
 168    def prereduce(src):
 169       return "(" + prereduce_expr.format(src=src) + ")"
 170    def final(src):
 171       return final_expr.format(src="(" + src + ")")
 172    def reduce_(src0, src1):
 173       return reduce_expr.format(src0=src0, src1=src1)
 174    src0 = prereduce("src0.x")
 175    src1 = prereduce("src0.y")
 176    src2 = prereduce("src0.z")
 177    src3 = prereduce("src0.w")
 178    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 179               final(reduce_(src0, src1)))
 180    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 181               final(reduce_(reduce_(src0, src1), src2)))
 182    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 183               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 184
 185 def unop_numeric_convert(name, out_type, in_type, const_expr):
 186    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 187
# Basic per-component unary opcodes.  "mov" is registered with the unsized
# uint type and simply yields its source.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# Several float opcodes below pick double-precision literals/functions when
# bit_size == 64 and the single-precision ("f"-suffixed) ones otherwise.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat clamps to [0.0, 1.0]; anything that is not > 1.0 and not > 0.0 (the
# "<= 0.0" branch) becomes 0.0.
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): fexp2/flog2 call the single-precision C functions for every
# bit size, unlike the opcodes above -- confirm that this is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 207
# Generate all of the numeric conversion opcodes
#
# Opcode names have the form <s>2<d><bits>[<rnd>], where <s> and <d> are the
# first letters of the source and destination base types (e.g. "f2f16_rtne",
# "i2b1").  The destination type is sized (dst_t + bit size) while the source
# type stays unsized.
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination base types that each source base type can be converted to.
   if src_t == tbool:
      dst_types = [tfloat, tint]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 gets three opcodes: explicit round-to-
              # nearest-even, explicit round-toward-zero, and one with no
              # rounding-mode suffix.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = "src0"

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32: only a source wider than 32 bits under a
              # round-toward-zero execution mode takes the explicit helper.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          else:
              # Conversions to bool test against zero; everything else
              # passes the source through unchanged.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
 265
 266
# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values whose magnitude is below ldexpf(1.0, -14) become a zero carrying the
# sign of the source; everything else is round-tripped through half precision.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp stores the exponent reported by frexp() straight into dst;
# frexp_sig keeps frexp()'s return value and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 297
 298
 299 # Floating point pack and unpack operations.
 300
 301 def pack_2x16(fmt):
 302    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 303 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 304 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 305 """.replace("fmt", fmt))
 306
 307 def pack_4x8(fmt):
 308    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 309 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 310 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 311 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 312 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 313 """.replace("fmt", fmt))
 314
 315 def unpack_2x16(fmt):
 316    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 317 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 318 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 319 """.replace("fmt", fmt))
 320
 321 def unpack_4x8(fmt):
 322    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 323 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 324 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 325 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 326 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 327 """.replace("fmt", fmt))
 328
 329
# Instantiate the pack/unpack opcodes for each format.  "half" only has the
# 2x16 variants.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
 340
# Integer packing: x occupies the low bits, later components the higher bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# NOTE(review): unlike pack_uvec2_to_uint, the components are not masked
# here, so bits above the low 8 of x/y/z spill into the neighbouring bytes --
# confirm that callers guarantee 8-bit values.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# The higher components are widened to the destination type before shifting.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Split a 64-bit value into its low (x) and high (y) 32-bit halves.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
 363
 364 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 365            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 366
# Split a 32-bit value into its low (x) and high (y) 16-bit halves.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")
 369
 370 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
 371 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
 372 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
 373 """)
 374
# Lowered floating point unpacking operations.
#
# The *_split_x variants read the low half of the source and the *_split_y
# variants read the high half.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: the expression's value is stored into the narrower
# destination type.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 392
# Bit operations, part of ARB_gpu_shader5.


# Reverses the order of the 32 bits of the source.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits of the source; the result is always 32-bit.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the most significant set bit, or -1 if the source is zero.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant (32-bit only): index of the most significant bit that
# differs from the sign, or -1 if no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 if the source is zero.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 443
 444
# Register fnoise<i>_<j> for every output size i and input size j in 1..4.
# The constant expression of each of them is simply 0.0f.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 448
 449
# AMD_gcn_shader extended instructions
#
# cube_face_coord: "ma" is twice the component with the largest magnitude.
# The (dst.x, dst.y) pair is chosen from the remaining components according
# to that major axis and its sign, then divided by ma and offset by 0.5.
# The conditions are evaluated in order, so on ties the last matching
# assignment wins.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# cube_face_index: 0/1 for +x/-x, 2/3 for +y/-y and 4/5 for +z/-z being the
# component of largest magnitude.  As above, the last matching condition
# wins on ties.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 487
 488 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 489    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 490           False, alg_props, const_expr)
 491
 492 def binop(name, ty, alg_props, const_expr):
 493    binop_convert(name, ty, ty, alg_props, const_expr)
 494
 495 def binop_compare(name, ty, alg_props, const_expr):
 496    binop_convert(name, tbool1, ty, alg_props, const_expr)
 497
 498 def binop_compare32(name, ty, alg_props, const_expr):
 499    binop_convert(name, tbool32, ty, alg_props, const_expr)
 500
 501 def binop_compare_all_sizes(name, ty, alg_props, const_expr):
 502    binop_compare(name, ty, alg_props, const_expr)
 503    binop_compare32(name + "32", ty, alg_props, const_expr)
 504
 505 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 506                 src2_type, const_expr):
 507    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 508           False, "", const_expr)
 509
 510 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 511                  reduce_expr, final_expr):
 512    def final(src):
 513       return final_expr.format(src= "(" + src + ")")
 514    def reduce_(src0, src1):
 515       return reduce_expr.format(src0=src0, src1=src1)
 516    def prereduce(src0, src1):
 517       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 518    src0 = prereduce("src0.x", "src1.x")
 519    src1 = prereduce("src0.y", "src1.y")
 520    src2 = prereduce("src0.z", "src1.z")
 521    src3 = prereduce("src0.w", "src1.w")
 522    opcode(name + "2", output_size, output_type,
 523           [2, 2], [src_type, src_type], False, _2src_commutative,
 524           final(reduce_(src0, src1)))
 525    opcode(name + "3", output_size, output_type,
 526           [3, 3], [src_type, src_type], False, _2src_commutative,
 527           final(reduce_(reduce_(src0, src1), src2)))
 528    opcode(name + "4", output_size, output_type,
 529           [4, 4], [src_type, src_type], False, _2src_commutative,
 530           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 531
# Under a round-toward-zero execution mode, 64-bit adds use the dedicated
# helper while narrower adds are computed in double and then converted with
# the rtz helper; otherwise a plain C addition is used.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add: (1ull << (bit_size - 1)) - 1 is the largest signed
# value and (1ull << (bit_size - 1)) is the bit pattern of the smallest.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Saturating unsigned add: a sum smaller than src0 means the add wrapped.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract, using the same limits as iadd_sat.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Saturating unsigned subtract: clamps at 0 instead of wrapping.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")
 556
# fsub and fmul follow the same round-toward-zero scheme as fadd.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low bits of the signed/unsigned integer multiply (the product truncated to
# the operand type)
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")
 587
# high half of the signed integer multiply: for sizes below 64 bits the
# product is computed in 64 bits and shifted right by bit_size; the 64-bit
# case takes words 2 and 3 of the ubm_mul_u32arr() result.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high half of the unsigned integer multiply, structured like imul_high but
# with two-word (zero-extended) operands in the 64-bit case.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# unsigned multiply of the low halves of the sources: each source is masked
# to its low bit_size / 2 bits before the multiplication.
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")
 634
 635
binop("fdiv", tfloat, "", "src0 / src1")
# Integer division by zero constant-folds to 0.
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.  The opcode's output type is uint (same as
# the sources), so the result of the C comparison is stored as an integer.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.  As with uadd_carry, the output type is
# uint.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 649
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
#
# The signed and unsigned variants share the expression text; they differ
# only in the operand type it is evaluated with.
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 662
 663 # rhadd: (a + b + 1) >> 1 (without overflow)
 664 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 665 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 666 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 667 #           =     ((x | y) << 1) - (x ^ y) + 1
 668 #
 669 # Since we know that the bottom bit of (x & y) << 1 is zero,
 670 #
 671 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 672 #                  = (x | y) -  ((x ^ y)      >> 1)
 673 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 674 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 675
# As with integer division, a zero divisor constant-folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod keeps the C remainder when it is zero or when both operands are on
# the same side of zero; otherwise it adds src1 to it.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# NOTE(review): fmod/frem call the single-precision floorf/truncf for every
# bit size -- confirm that this is intended for 64-bit operands.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 692
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
#
# Each call registers two opcodes: <name> with a bool1 result and <name>32
# with a bool32 result.

binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# The "all" forms AND the per-component equalities together; the "any" forms
# OR the per-component inequalities.  Results are bool1.

binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# Same reductions with a bool32 result.
binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
 730
# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with the unsized float type while slt, seq
# and sne are float32-only -- confirm whether the difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 745
# SPIR-V shifts are undefined for shift operands >= the bit size, whereas SM5
# shifts are defined to use only the least significant bits of the shift
# operand.  The NIR definition follows the SM5 specification.
#
# Accordingly each expression masks the shift count (src1, always tuint32)
# with "bit width of src0 minus one" before shifting.  ishl and ishr take a
# tint value, ushr takes a tuint value.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 755
# Rotate left (urol) and rotate right (uror) of a tuint value by a tuint32
# count.  The count is masked to the bit width of src0 minus one, and the
# complementary shift uses the masked negated count, so a rotate by zero
# shifts by zero in both halves of the expression instead of by the full
# width.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")
 766
# Bitwise logic operators.
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.  All three operate on tuint and are flagged both commutative
# and associative.


binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 776
 777
# Float dot product: per-component products ({src0} * {src1}) combined by
# addition.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Same templates as fdot, but declared with 4 where fdot passes 1
# (presumably the output size, i.e. the result is replicated to four
# components -- confirm against binop_reduce).
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Homogeneous dot product: src0 has 3 components and src1 has 4.  The result
# is the 3-component dot product plus src1.w.  The _replicated variant uses
# the same expression with an output size of 4 instead of 1.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 788
# Two-source minimum and maximum.  The float versions call the C fmin()/fmax()
# functions and are declared with no algebraic properties; the signed and
# unsigned integer versions use a plain comparison and are flagged
# commutative and associative.
binop("fmin", tfloat, "", "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 795
# The *_4x8 opcodes below treat each tint32 source as four independent 8-bit
# lanes.  Every constant expression loops over the bit offsets 0, 8, 16 and
# 24, extracts the lane from both sources with a shift and a 0xff mask,
# computes the per-lane result and ORs it back into dst at the same offset.

# Saturated vector add for 4 8bit ints: each lane sum is clamped to 0xff.
binop("usadd_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints: a lane receives src0 - src1 only
# when src0's lane is larger; otherwise it stays 0.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255, computed per lane with integer division.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 840
 841 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 842
# Packs two single-component tfloat32 sources into one tuint32: src0.x goes
# through pack_half_1x16() into the low 16 bits and src1.x into the high 16
# bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Combines two tuint32 sources into a tuint64: src0 is the low half and src1
# the high half (widened before shifting so no bits are lost).
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# Combines two tuint16 sources into a tuint32: src0 is the low half and src1
# the high half.
binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
 851
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, the bits and offset values
# are from the low five bits of src0 and src1, respectively.
#
# The result is a mask of "bits" consecutive one bits shifted left by
# "offset".
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""")
 860
# ldexp: scales the float src0 by a power of two given by the tint32 src1,
# using the C ldexp() for 64-bit operands and ldexpf() otherwise.  Any result
# for which isnormal() is false is then replaced by a zero carrying the sign
# of src0.
# NOTE(review): isnormal() is also false for infinities and NaNs, so those
# results are replaced by a signed zero as well -- confirm this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
 867
# Combines the first component of each input to make a 2-component vector.
# Both sources are declared as single-component tuint values and the result
# is a 2-component tuint.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")
 874
# Byte extraction: src1 selects the byte (src0 is shifted right by src1 * 8)
# and the C cast truncates to 8 bits; the _i8 form casts to a signed type.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: as above with 16-bit units (shift by src1 * 16).
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 882
 883
 884 def triop(name, ty, alg_props, const_expr):
 885    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 886 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 887    opcode(name, output_size, tuint,
 888    [src1_size, src2_size, src3_size],
 889    [tuint, tuint, tuint], False, "", const_expr)
 890
# Fused multiply-add of three float sources, flagged commutative in its first
# two sources.
#
# When nir_is_rounding_mode_rtz() reports true for the execution mode and bit
# size, the _mesa_*_fma_rtz helpers are used: the double helper for 64-bit,
# the float helper for 32-bit, and for any other bit size the double helper
# followed by _mesa_double_to_float_rtz().  Otherwise the C fmaf() is used
# for 32-bit operands and fma() for everything else.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")
 906
# Linear interpolation: src2 is the weight, giving src0 when it is 0 and src1
# when it is 1.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


# Floating-point-bool version: src1 is selected when src0 is not 0.0f,
# otherwise src2.
triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 917
 918 # 3 way min/max/med
 919 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 920 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 921 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 922
 923 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 924 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 925 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 926
 927 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 928 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 929 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 930
# Boolean-conditioned select: src1 when the condition src0 is true, otherwise
# src2.  The two declarations differ only in the type of the condition source
# (tbool1 vs. tbool32); the selected values and the result are tuint.
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 935
# SM5 bfi assembly
#
# Sources are (mask, insert, base).  With a zero mask the result is base
# unchanged.  Otherwise insert is shifted left once for every trailing zero
# bit of mask, and the result takes the bits of the shifted insert where mask
# is set and the bits of base where it is clear.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")


# Per-bit select: takes the bit from src1 where src0 has a 1 and from src2
# where src0 has a 0.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 953
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
#
# Sources are (base, offset, bits).  A zero bit count gives 0.  When
# offset + bits is below 32 the field is first shifted up so that its top bit
# is bit 31 and then shifted down by 32 - bits; otherwise the result is simply
# base >> offset.  ubfe holds base in an unsigned variable and ibfe in an int,
# so the right shifts operate on an unsigned and a signed value respectively.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
 981
# GLSL bitfieldExtract()
#
# Unsigned form.  Sources are (base, offset, bits) with offset and bits
# signed.  A zero bit count gives 0, and so does any negative argument or a
# field that would reach past bit 31.  Otherwise base is shifted right by
# offset and masked down to the low "bits" bits; the mask is built from a
# 64-bit one so that bits == 32 still works.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
 995 opcode("ibitfield_extract", 0, tint32,
 996        [0, 0, 0], [tint32, tint32, tint32], False, "", """
 997 int base = src0;
 998 int offset = src1, bits = src2;
 999 if (bits == 0) {
1000    dst = 0;
1001 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
1002    dst = 0;
1003 } else {
1004    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
1005 }
1006 """)
1007
# Combines the first component of each input to make a 3-component vector.
# All three sources are declared with a size of 1.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1015
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit component counts.

   - output_size is forwarded as the opcode's output size
   - src1_size through src4_size are forwarded, in that order, as the input
     sizes

   The output and all four inputs are typed tuint, the flag that follows the
   input types is False and the algebraic-properties string is empty.
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   src_types = [tuint] * len(src_sizes)
   opcode(name, output_size, tuint, src_sizes, src_types,
          False, "", const_expr)
1022
# Bitfield insert.  Sources are (base, insert, offset, bits); offset and bits
# are signed.
#
# A zero bit count returns base unchanged.  A negative argument, or a field
# that would reach past bit 31, gives 0.  Otherwise the "bits"-wide field of
# base starting at "offset" is replaced by the low bits of insert; the mask
# is built from a 64-bit one so that bits == 32 still works.
# NOTE(review): presumably the counterpart of GLSL bitfieldInsert() -- the
# code here does not say so; confirm.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1036
# Combines the first component of each input to make a 4-component vector.
# All four sources are declared with a size of 1.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
1043
# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower-precision integer multiplies can be used.
# This is useful on hardware that has 24-bit or perhaps 16-bit integer
# multiply instructions.
#
# The constant expression itself is a plain multiplication.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")
1055
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
#
# In the expression below, "ah" is the upper 16 bits of src0.x, "bl" the
# lower 16 bits of src1.x and "c" is src2.x.
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")
1063
# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
#
# Each multiplicand is shifted left by 8 and back right by 8 as an int32_t,
# which keeps its low 24 bits and sign-extends from bit 23; src2 is added to
# the product unmodified.
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension); the same product as
# above without the addend.
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")