src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
# helper variables for strings
#
# These are the NIR type names passed to the opcode helpers below.  Names
# without a trailing number carry no explicit bit size.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Splits a type string into its base type ("type" group) and an optional
# decimal bit size ("bits" group).  It is used with match(), so only the
# start of the string is anchored.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 109
 110 def type_has_size(type_):
 111     m = _TYPE_SPLIT_RE.match(type_)
 112     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 113     return m.group('bits') is not None
 114
 115 def type_size(type_):
 116     m = _TYPE_SPLIT_RE.match(type_)
 117     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 118     assert m.group('bits') is not None, \
 119            'NIR type string has no bit size: "{}"'.format(type_)
 120     return int(m.group('bits'))
 121
 122 def type_sizes(type_):
 123     if type_has_size(type_):
 124         return [type_size(type_)]
 125     elif type_ == 'bool':
 126         return [1, 32]
 127     elif type_ == 'float':
 128         return [16, 32, 64]
 129     else:
 130         return [1, 8, 16, 32, 64]
 131
 132 def type_base_type(type_):
 133     m = _TYPE_SPLIT_RE.match(type_)
 134     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 135     return m.group('type')
 136
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
#
# Each property string ends in a space so that several properties can be
# combined with "+" into one space-separated string.
_2src_commutative = "2src_commutative "
associative = "associative "

# global dictionary of opcodes, keyed by opcode name
opcodes = {}
 147
 148 def opcode(name, output_size, output_type, input_sizes, input_types,
 149            is_conversion, algebraic_properties, const_expr):
 150    assert name not in opcodes
 151    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 152                           input_types, is_conversion, algebraic_properties,
 153                           const_expr)
 154
 155 def unop_convert(name, out_type, in_type, const_expr):
 156    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 157
 158 def unop(name, ty, const_expr):
 159    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 160
 161 def unop_horiz(name, output_size, output_type, input_size, input_type,
 162                const_expr):
 163    opcode(name, output_size, output_type, [input_size], [input_type],
 164           False, "", const_expr)
 165
 166 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 167                 reduce_expr, final_expr):
 168    def prereduce(src):
 169       return "(" + prereduce_expr.format(src=src) + ")"
 170    def final(src):
 171       return final_expr.format(src="(" + src + ")")
 172    def reduce_(src0, src1):
 173       return reduce_expr.format(src0=src0, src1=src1)
 174    src0 = prereduce("src0.x")
 175    src1 = prereduce("src0.y")
 176    src2 = prereduce("src0.z")
 177    src3 = prereduce("src0.w")
 178    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 179               final(reduce_(src0, src1)))
 180    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 181               final(reduce_(reduce_(src0, src1), src2)))
 182    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 183               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 184
 185 def unop_numeric_convert(name, out_type, in_type, const_expr):
 186    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 187
# Basic unary operations.  The last argument of each call is the C
# expression used to compute the constant value of the opcode.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# The float expressions below pick double or single precision literals and
# math functions depending on bit_size.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat clamps to [0.0, 1.0].
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): unlike the operations above, these two use the
# single-precision C functions for every bit size -- confirm this is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 207
# Generate all of the numeric conversion opcodes
#
# Opcode names are built as <src letter>2<dst letter><bit size>[<rounding>],
# using the first character of the source and destination type strings.
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination types that each source type can be converted to.
   if src_t == tbool:
      dst_types = [tfloat, tint]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for bit_size in type_sizes(dst_t):
          if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 gets three opcodes: explicit _rtne and _rtz
              # rounding variants plus one with no suffix.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = "src0"

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
                                                              bit_size, rnd_mode),
                                       dst_t + str(bit_size), src_t, conv_expr)
          elif bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32 uses an explicit round-toward-zero helper
              # when the source is wider than 32 bits and the execution mode
              # requests rtz for 32-bit floats.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
                                   dst_t + str(bit_size), src_t, conv_expr)
          else:
              # Conversions to bool test against zero; everything else is a
              # plain assignment.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
                                   dst_t + str(bit_size), src_t, conv_expr)
 260
 261
# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values whose magnitude is below 2^-14 become a zero carrying the sign of
# the source; all other values are round-tripped through half precision.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
#
# frexp_exp keeps the exponent that frexp() writes through its pointer
# argument; frexp_sig keeps frexp()'s return value and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 292
 293
 294 # Floating point pack and unpack operations.
 295
 296 def pack_2x16(fmt):
 297    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 298 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 299 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 300 """.replace("fmt", fmt))
 301
 302 def pack_4x8(fmt):
 303    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 304 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 305 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 306 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 307 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 308 """.replace("fmt", fmt))
 309
 310 def unpack_2x16(fmt):
 311    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 312 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 313 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 314 """.replace("fmt", fmt))
 315
 316 def unpack_4x8(fmt):
 317    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 318 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 319 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 320 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 321 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 322 """.replace("fmt", fmt))
 323
 324
# Register the pack and unpack opcodes for each supported format.  The
# "half" format only has 2x16 variants.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
 335
# Integer packing operations.  The first component always lands in the
# lowest bits of the result.

# Only the low 16 bits of src0.x are kept; src0.y is shifted into the high
# half without masking.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# NOTE(review): the components are OR-ed together without masking to 8 bits,
# so sources are presumably expected to already fit in a byte -- confirm.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# The casts widen each component before shifting so that no bits are lost.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
 355
 356 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
 357            "dst.x = src0.x; dst.y = src0.x >> 32;")
 358
 359 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 360            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 361
 362 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
 363            "dst.x = src0.x; dst.y = src0.x >> 16;")
 364
 365 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
 366 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
 367 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
 368 """)
 369
# Lowered floating point unpacking operations.
#
# The _x variants read the low half of the source and the _y variants read
# the high half.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: the destination type is narrower than the source type.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 387
# Bit operations, part of ARB_gpu_shader5.


# Mirrors the 32 bits of the source: bit N moves to bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits of the source; the result is always uint32.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the most significant set bit, or -1 when the source is 0.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant, 32-bit only.  The result is -1 when no matching bit is
# found, which happens for a source of 0 or -1.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when the source is 0.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 438
 439
# Register fnoise<i>_<j> for every combination of output size i and input
# size j from 1 to 4.  The constant value of every variant is 0.0f.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 443
 444
# AMD_gcn_shader extended instructions

# Computes the 2D coordinate within a cube face from a 3-component direction.
# "ma" is twice the component with the largest magnitude; when magnitudes
# tie, the later of the matching "if" statements wins.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# Selects the cube face index (0..5) from the component with the largest
# magnitude and its sign: 0/1 for +X/-X, 2/3 for +Y/-Y, 4/5 for +Z/-Z.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 482
 483 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 484    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 485           False, alg_props, const_expr)
 486
 487 def binop(name, ty, alg_props, const_expr):
 488    binop_convert(name, ty, ty, alg_props, const_expr)
 489
 490 def binop_compare(name, ty, alg_props, const_expr):
 491    binop_convert(name, tbool1, ty, alg_props, const_expr)
 492
 493 def binop_compare32(name, ty, alg_props, const_expr):
 494    binop_convert(name, tbool32, ty, alg_props, const_expr)
 495
 496 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 497                 src2_type, const_expr):
 498    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 499           False, "", const_expr)
 500
 501 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 502                  reduce_expr, final_expr):
 503    def final(src):
 504       return final_expr.format(src= "(" + src + ")")
 505    def reduce_(src0, src1):
 506       return reduce_expr.format(src0=src0, src1=src1)
 507    def prereduce(src0, src1):
 508       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 509    src0 = prereduce("src0.x", "src1.x")
 510    src1 = prereduce("src0.y", "src1.y")
 511    src2 = prereduce("src0.z", "src1.z")
 512    src3 = prereduce("src0.w", "src1.w")
 513    opcode(name + "2", output_size, output_type,
 514           [2, 2], [src_type, src_type], False, _2src_commutative,
 515           final(reduce_(src0, src1)))
 516    opcode(name + "3", output_size, output_type,
 517           [3, 3], [src_type, src_type], False, _2src_commutative,
 518           final(reduce_(reduce_(src0, src1), src2)))
 519    opcode(name + "4", output_size, output_type,
 520           [4, 4], [src_type, src_type], False, _2src_commutative,
 521           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 522
# Floating-point add.  When the execution mode asks for round-toward-zero at
# this bit size, dedicated rtz helpers are used: a double-precision add for
# 64-bit, and a double-precision sum rounded to float otherwise.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add: on overflow the result is the largest
# ((1 << (bit_size - 1)) - 1) or smallest (1 << (bit_size - 1)) value
# representable at this bit size.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Saturating unsigned add: a wrapped sum is smaller than src0.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract, mirroring iadd_sat.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Saturating unsigned subtract clamps at 0.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")
 547
# Floating-point subtract, with the same round-toward-zero handling as fadd.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")

# Floating-point multiply, with the same round-toward-zero handling as fadd.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")
 578
# high 32-bits of signed integer multiply
#
# The opcode is declared on the unsized int type: for sizes below 64 the
# product is computed in 64 bits and shifted right by bit_size; 64-bit
# sources go through a multi-word multiply instead.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
#
# Same structure as imul_high; the unsigned sources need no sign-extension
# words, so two 32-bit words per source are enough.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# low 32-bits of unsigned integer multiply
#
# Both sources are masked to their low bit_size / 2 bits before being
# multiplied.
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")
 625
 626
# Division.  The integer variants constant-fold a division by zero to 0.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
#
# Note that the opcode is declared with a uint destination type, not one of
# the bool types.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
#
# As with uadd_carry, the destination type is uint.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 640
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
#
# The signed and unsigned variants share the same expression text; they
# differ only in the type the sources are declared with.
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 653
 654 # rhadd: (a + b + 1) >> 1 (without overflow)
 655 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 656 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 657 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 658 #           =     ((x | y) << 1) - (x ^ y) + 1
 659 #
 660 # Since we know that the bottom bit of (x & y) << 1 is zero,
 661 #
 662 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 663 #                  = (x | y) -  ((x ^ y)      >> 1)
 664 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 665 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 666
# Unsigned modulus; a zero divisor constant-folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient down (floorf) while frem rounds it toward zero
# (truncf).
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 683
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
#
# Each comparison is registered twice: once with a bool1 result and once,
# with a "32" suffix, with a bool32 result.

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare("fne", tfloat, _2src_commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare("ine", tint, _2src_commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
binop_compare32("flt32", tfloat, "", "src0 < src1")
binop_compare32("fge32", tfloat, "", "src0 >= src1")
binop_compare32("feq32", tfloat, _2src_commutative, "src0 == src1")
binop_compare32("fne32", tfloat, _2src_commutative, "src0 != src1")
binop_compare32("ilt32", tint, "", "src0 < src1")
binop_compare32("ige32", tint, "", "src0 >= src1")
binop_compare32("ieq32", tint, _2src_commutative, "src0 == src1")
binop_compare32("ine32", tint, _2src_commutative, "src0 != src1")
binop_compare32("ult32", tuint, "", "src0 < src1")
binop_compare32("uge32", tuint, "", "src0 >= src1")
 711
# integer-aware GLSL-style comparisons that compare floats and ints
#
# The "all" variants AND the per-component equality tests together; the
# "any" variants OR the per-component inequality tests together.

binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# Same reductions with a bool32 result.
binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared on the unsized float type while slt, seq and
# sne are declared on float32 -- confirm whether the difference is intended.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 746
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use only the least significant bits.
# The NIR definition is according to the SM5 specification: the shift count
# (src1, always tuint32) is masked with the bit width of src0 minus one
# before the shift is applied.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 756
# Rotate left (urol) and rotate right (uror).  The rotate count (src1) is
# masked with the bit width of src0 minus one; the bits shifted out on one
# side are brought back in by shifting the other way by the negated count
# under the same mask.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")
 767
 768 # bitwise logic operators
 769 #
 770 # These are also used as boolean and, or, xor for hardware supporting
 771 # integers.
 772
 773
 774 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
 775 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
 776 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 777
 778
# Dot products: per-component multiply, reduced with addition.  "fdot" has
# output size 1; "fdot_replicated" is declared with output size 4.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: 3-component dot product of src0 (size 3) with src1.xyz, plus src1.w.
# The "_replicated" form uses the same expression with output size 4.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 789
# Two-source minimum / maximum.  The float versions call the C fmin()/fmax()
# functions and are registered with no algebraic properties; the integer
# versions use a conditional expression and are marked commutative and
# associative.
binop("fmin", tfloat, "", "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 796
# Saturated vector add for 4 8bit ints.
# Each byte lane of src0 and src1 is added on its own and the lane sum is
# clamped to 0xff before being placed back at the same bit position.
binop("usadd_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")
 804
# Saturated vector subtract for 4 8bit ints.
# A byte lane receives src0 - src1 only when src0's lane is greater than
# src1's; otherwise the lane keeps the 0 that dst was initialised with.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")
 815
# vector min for 4 8bit ints.
# Each byte lane of the result is the smaller of the matching lanes of src0
# and src1.
binop("umin_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")
 823
# vector max for 4 8bit ints.
# Each byte lane of the result is the larger of the matching lanes of src0
# and src1.
binop("umax_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")
 831
# unorm multiply: (a * b) / 255.
# Applied to each byte lane independently; the division is C integer
# division, so the quotient is truncated.
# NOTE(review): the opcode is flagged associative, but truncating each
# product can make (a*b)*c differ from a*(b*c) -- confirm the flag is wanted.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 841
 842 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 843
# Packs two tfloat32 scalars into one tuint32: the pack_half_1x16() result
# for src0.x is ORed with the pack_half_1x16() result for src1.x shifted left
# by 16.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Builds a 64-bit value from two 32-bit halves: src0 is the low half and src1
# is shifted into the high half.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# Builds a 32-bit value from two 16-bit halves: src0 is the low half and src1
# is shifted into the high half.
binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
 852
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, the bits and offset values
# are from the low five bits of src0 and src1, respectively.
# The result is a run of `bits` one-bits shifted left by `offset`.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""")
 861
# ldexp: src0 is the float value and src1 (tint32) the exponent.  ldexp() is
# used for 64-bit and ldexpf() for every other bit size.  Any result that is
# not a normal number is then replaced by a zero carrying the sign of src0.
# NOTE(review): C isnormal() is also false for infinities and NaN, so those
# results are turned into signed zero as well -- confirm that is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
 868
# Combines the first component of each input to make a 2-component vector.
# Both inputs are declared with size 1 and type tuint; the output has size 2.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")
 875
# Byte extraction
# src1 selects the byte: src0 is shifted right by src1 * 8 and the result is
# cast to uint8_t (extract_u8) or int8_t (extract_i8).
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# src1 selects the 16-bit word: src0 is shifted right by src1 * 16 and the
# result is cast to uint16_t (extract_u16) or int16_t (extract_i16).
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 883
 884
 885 def triop(name, ty, alg_props, const_expr):
 886    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 887 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 888    opcode(name, output_size, tuint,
 889    [src1_size, src2_size, src3_size],
 890    [tuint, tuint, tuint], False, "", const_expr)
 891
# Fused multiply-add of src0 * src1 + src2.
# When nir_is_rounding_mode_rtz() reports true for the current execution mode
# and bit size, the _mesa_*_fma_rtz helpers are used: the double helper for
# 64-bit, the float helper for 32-bit, and for any other bit size the double
# helper followed by _mesa_double_to_float_rtz().  Otherwise fmaf() is used
# for 32-bit and fma() for every other bit size.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")
 907
# Linear interpolation between src0 and src1: src2 is the weight applied to
# src1 and (1 - src2) the weight applied to src0.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 909
# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
#
# fcsel is the floating point version: it yields src1 when src0 compares
# unequal to 0.0f and src2 otherwise.


triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 918
 919 # 3 way min/max/med
 920 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 921 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 922 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 923
 924 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 925 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 926 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 927
 928 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 929 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 930 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 931
# Boolean-conditioned select: yields src1 when src0 is true and src2
# otherwise.  bcsel takes a tbool1 condition, b32csel a tbool32 condition;
# the selected values and the result are tuint.
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 936
# SM5 bfi assembly
# src0 is the mask, src1 the value to insert and src2 the base.  The insert
# value is shifted left once for every trailing zero bit of the mask and is
# then merged into the base under the mask.  A zero mask returns the base
# unchanged; it is handled up front because the trailing-zero loop would
# never terminate for it.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")
 951
 952
# Per-bit select: each result bit comes from src1 where the matching bit of
# src0 is 1 and from src2 where it is 0.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 954
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
# ubfe is the unsigned form.  src0 is the base, src1 the offset and src2 the
# bit count.  A bit count of 0 gives 0.  When offset + bits is below 32 the
# field is moved to the top of the word and shifted back down (zero filled);
# otherwise the result is simply base >> offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
# Signed bit-field extract.  Same structure as the unsigned form, but the
# base is held in a signed int so the right shifts operate on a signed
# value.  Only the low 5 bits of the offset (src1) and bit count (src2) are
# used; a bit count of 0 gives 0.
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
 982
# GLSL bitfieldExtract()
# Unsigned form: src0 is the base, src1 the offset and src2 the bit count
# (both signed).  A bit count of 0 gives 0, and so does any negative offset
# or bit count or a field that would reach past bit 31.  Otherwise the base
# is shifted down by the offset and masked to `bits` bits; the mask is built
# in 64-bit arithmetic so that bits == 32 is handled.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
 996 opcode("ibitfield_extract", 0, tint32,
 997        [0, 0, 0], [tint32, tint32, tint32], False, "", """
 998 int base = src0;
 999 int offset = src1, bits = src2;
1000 if (bits == 0) {
1001    dst = 0;
1002 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
1003    dst = 0;
1004 } else {
1005    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
1006 }
1007 """)
1008
# Combines the first component of each input to make a 3-component vector.
# Each of the three inputs is declared with size 1; the output has size 3.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1016
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-source tuint opcode with explicit component sizes.

   The destination and every source are typed tuint.  The opcode is
   registered as a non-conversion with no algebraic properties.

   - name is the opcode name
   - output_size is the destination size forwarded to opcode()
   - src1_size .. src4_size are the sizes of the four sources
   - const_expr is the C constant-folding expression forwarded to opcode()
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   src_types = [tuint] * len(src_sizes)
   opcode(name, output_size, tuint, src_sizes, src_types, False, "",
          const_expr)
1023
# Bit-field insert: src0 is the base, src1 the value to insert, src2 the
# offset and src3 the bit count (offset and count are signed).  A bit count
# of 0 returns the base unchanged.  A negative offset or bit count, or a
# field that would reach past bit 31, gives 0.  Otherwise `bits` bits of the
# insert value replace the bits of the base starting at `offset`; the mask is
# built in 64-bit arithmetic so that bits == 32 is handled.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1037
# Combines the first component of each of the four inputs to make a
# 4-component vector.  Each input is declared with size 1.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
1044
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
# The expression multiplies the high 16 bits of src0 by the low 16 bits of
# src1, shifts the product left by 16 and adds src2.
opcode("imadsh_mix16", 1, tint32,
       [1, 1, 1], [tint32, tint32, tint32], False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")