src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
  93 # helper variables for strings
  94 tfloat = "float"
  95 tint = "int"
  96 tbool = "bool"
  97 tbool1 = "bool1"
  98 tbool16 = "bool16"
  99 tbool32 = "bool32"
 100 tuint = "uint"
 101 tuint16 = "uint16"
 102 tfloat32 = "float32"
 103 tint32 = "int32"
 104 tuint32 = "uint32"
 105 tint64 = "int64"
 106 tuint64 = "uint64"
 107 tfloat64 = "float64"
 108
 109 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 110
 111 def type_has_size(type_):
 112     m = _TYPE_SPLIT_RE.match(type_)
 113     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 114     return m.group('bits') is not None
 115
 116 def type_size(type_):
 117     m = _TYPE_SPLIT_RE.match(type_)
 118     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 119     assert m.group('bits') is not None, \
 120            'NIR type string has no bit size: "{}"'.format(type_)
 121     return int(m.group('bits'))
 122
 123 def type_sizes(type_):
 124     if type_has_size(type_):
 125         return [type_size(type_)]
 126     elif type_ == 'bool':
 127         return [1, 16, 32]
 128     elif type_ == 'float':
 129         return [16, 32, 64]
 130     else:
 131         return [1, 8, 16, 32, 64]
 132
 133 def type_base_type(type_):
 134     m = _TYPE_SPLIT_RE.match(type_)
 135     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 136     return m.group('type')
 137
 138 # Operation where the first two sources are commutative.
 139 #
# For 2-source operations, this is just mathematical commutativity.  Some
 141 # 3-source operations, like ffma, are only commutative in the first two
 142 # sources.
 143 _2src_commutative = "2src_commutative "
 144 associative = "associative "
 145
 146 # global dictionary of opcodes
 147 opcodes = {}
 148
 149 def opcode(name, output_size, output_type, input_sizes, input_types,
 150            is_conversion, algebraic_properties, const_expr):
 151    assert name not in opcodes
 152    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 153                           input_types, is_conversion, algebraic_properties,
 154                           const_expr)
 155
 156 def unop_convert(name, out_type, in_type, const_expr):
 157    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 158
 159 def unop(name, ty, const_expr):
 160    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 161
 162 def unop_horiz(name, output_size, output_type, input_size, input_type,
 163                const_expr):
 164    opcode(name, output_size, output_type, [input_size], [input_type],
 165           False, "", const_expr)
 166
 167 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 168                 reduce_expr, final_expr):
 169    def prereduce(src):
 170       return "(" + prereduce_expr.format(src=src) + ")"
 171    def final(src):
 172       return final_expr.format(src="(" + src + ")")
 173    def reduce_(src0, src1):
 174       return reduce_expr.format(src0=src0, src1=src1)
 175    src0 = prereduce("src0.x")
 176    src1 = prereduce("src0.y")
 177    src2 = prereduce("src0.z")
 178    src3 = prereduce("src0.w")
 179    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 180               final(reduce_(src0, src1)))
 181    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 182               final(reduce_(reduce_(src0, src1), src2)))
 183    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 184               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 185
 186 def unop_numeric_convert(name, out_type, in_type, const_expr):
 187    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 188
 189 unop("mov", tuint, "src0")
 190
 191 unop("ineg", tint, "-src0")
 192 unop("fneg", tfloat, "-src0")
 193 unop("inot", tint, "~src0") # invert every bit of the integer
 194 unop("fsign", tfloat, ("bit_size == 64 ? " +
 195                        "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
 196                        "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
 197 unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
 198 unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
 199 unop("fabs", tfloat, "fabs(src0)")
 200 unop("fsat", tfloat, ("bit_size == 64 ? " +
 201                       "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
 202                       "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
 203 unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
 204 unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
 205 unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
 206 unop("fexp2", tfloat, "exp2f(src0)")
 207 unop("flog2", tfloat, "log2f(src0)")
 208
 209 # Generate all of the numeric conversion opcodes
 210 for src_t in [tint, tuint, tfloat, tbool]:
 211    if src_t == tbool:
 212       dst_types = [tfloat, tint]
 213    elif src_t == tint:
 214       dst_types = [tfloat, tint, tbool]
 215    elif src_t == tuint:
 216       dst_types = [tfloat, tuint]
 217    elif src_t == tfloat:
 218       dst_types = [tint, tuint, tfloat, tbool]
 219
 220    for dst_t in dst_types:
 221       for dst_bit_size in type_sizes(dst_t):
 222           if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
 223               rnd_modes = ['_rtne', '_rtz', '']
 224               for rnd_mode in rnd_modes:
 225                   if rnd_mode == '_rtne':
 226                       conv_expr = """
 227                       if (bit_size > 16) {
 228                          dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
 229                       } else {
 230                          dst = src0;
 231                       }
 232                       """
 233                   elif rnd_mode == '_rtz':
 234                       conv_expr = """
 235                       if (bit_size > 16) {
 236                          dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
 237                       } else {
 238                          dst = src0;
 239                       }
 240                       """
 241                   else:
 242                       conv_expr = "src0"
 243
 244                   unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
 245                                                               dst_t[0],
 246                                                               dst_bit_size,
 247                                                               rnd_mode),
 248                                        dst_t + str(dst_bit_size),
 249                                        src_t, conv_expr)
 250           elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
 251               conv_expr = """
 252               if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
 253                  dst = _mesa_double_to_float_rtz(src0);
 254               } else {
 255                  dst = src0;
 256               }
 257               """
 258               unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
 259                                                        dst_bit_size),
 260                                    dst_t + str(dst_bit_size), src_t, conv_expr)
 261           else:
 262               conv_expr = "src0 != 0" if dst_t == tbool else "src0"
 263               unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
 264                                                        dst_bit_size),
 265                                    dst_t + str(dst_bit_size), src_t, conv_expr)
 266
 267
 268 # Unary floating-point rounding operations.
 269
 270
 271 unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
 272 unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
 273 unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
 274 unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
 275 unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
 276
 277 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 278
 279 # Trigonometric operations.
 280
 281
 282 unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
 283 unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
 284
 285 # dfrexp
 286 unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
 287 unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
 288
 289 # Partial derivatives.
 290
 291
 292 unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
 293 unop("fddy", tfloat, "0.0")
 294 unop("fddx_fine", tfloat, "0.0")
 295 unop("fddy_fine", tfloat, "0.0")
 296 unop("fddx_coarse", tfloat, "0.0")
 297 unop("fddy_coarse", tfloat, "0.0")
 298
 299
 300 # Floating point pack and unpack operations.
 301
 302 def pack_2x16(fmt):
 303    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 304 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 305 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 306 """.replace("fmt", fmt))
 307
 308 def pack_4x8(fmt):
 309    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 310 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 311 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 312 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 313 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 314 """.replace("fmt", fmt))
 315
 316 def unpack_2x16(fmt):
 317    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 318 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 319 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 320 """.replace("fmt", fmt))
 321
 322 def unpack_4x8(fmt):
 323    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 324 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 325 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 326 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 327 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 328 """.replace("fmt", fmt))
 329
 330
 331 pack_2x16("snorm")
 332 pack_4x8("snorm")
 333 pack_2x16("unorm")
 334 pack_4x8("unorm")
 335 pack_2x16("half")
 336 unpack_2x16("snorm")
 337 unpack_4x8("snorm")
 338 unpack_2x16("unorm")
 339 unpack_4x8("unorm")
 340 unpack_2x16("half")
 341
 342 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 343 dst.x = (src0.x & 0xffff) | (src0.y << 16);
 344 """)
 345
 346 unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
 347 dst.x = (src0.x <<  0) |
 348         (src0.y <<  8) |
 349         (src0.z << 16) |
 350         (src0.w << 24);
 351 """)
 352
 353 unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
 354            "dst.x = src0.x | ((uint32_t)src0.y << 16);")
 355
 356 unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
 357            "dst.x = src0.x | ((uint64_t)src0.y << 32);")
 358
 359 unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
 360            "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
 361
 362 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
 363            "dst.x = src0.x; dst.y = src0.x >> 32;")
 364
 365 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 366            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 367
 368 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
 369            "dst.x = src0.x; dst.y = src0.x >> 16;")
 370
 371 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
 372 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
 373 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
 374 """)
 375
 376 # Lowered floating point unpacking operations.
 377
 378 unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
 379              "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
 380 unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
 381              "unpack_half_1x16((uint16_t)(src0 >> 16))")
 382
 383 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
 384              "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
 385 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
 386              "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
 387
 388 unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
 389 unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")
 390
 391 unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
 392 unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 393
 394 # Bit operations, part of ARB_gpu_shader5.
 395
 396
 397 unop("bitfield_reverse", tuint32, """
 398 /* we're not winning any awards for speed here, but that's ok */
 399 dst = 0;
 400 for (unsigned bit = 0; bit < 32; bit++)
 401    dst |= ((src0 >> bit) & 1) << (31 - bit);
 402 """)
 403 unop_convert("bit_count", tuint32, tuint, """
 404 dst = 0;
 405 for (unsigned bit = 0; bit < bit_size; bit++) {
 406    if ((src0 >> bit) & 1)
 407       dst++;
 408 }
 409 """)
 410
 411 unop_convert("ufind_msb", tint32, tuint, """
 412 dst = -1;
 413 for (int bit = bit_size - 1; bit >= 0; bit--) {
 414    if ((src0 >> bit) & 1) {
 415       dst = bit;
 416       break;
 417    }
 418 }
 419 """)
 420
 421 unop("ifind_msb", tint32, """
 422 dst = -1;
 423 for (int bit = 31; bit >= 0; bit--) {
 424    /* If src0 < 0, we're looking for the first 0 bit.
 425     * if src0 >= 0, we're looking for the first 1 bit.
 426     */
 427    if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
 428       (!((src0 >> bit) & 1) && (src0 < 0))) {
 429       dst = bit;
 430       break;
 431    }
 432 }
 433 """)
 434
 435 unop_convert("find_lsb", tint32, tint, """
 436 dst = -1;
 437 for (unsigned bit = 0; bit < bit_size; bit++) {
 438    if ((src0 >> bit) & 1) {
 439       dst = bit;
 440       break;
 441    }
 442 }
 443 """)
 444
 445
 446 for i in range(1, 5):
 447    for j in range(1, 5):
 448       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 449
 450
 451 # AMD_gcn_shader extended instructions
 452 unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
 453 dst.x = dst.y = 0.0;
 454 float absX = fabs(src0.x);
 455 float absY = fabs(src0.y);
 456 float absZ = fabs(src0.z);
 457
 458 float ma = 0.0;
 459 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
 460 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
 461 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
 462
 463 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
 464 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
 465 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
 466 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
 467 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
 468 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
 469
 470 dst.x = dst.x / ma + 0.5;
 471 dst.y = dst.y / ma + 0.5;
 472 """)
 473
 474 unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
 475 float absX = fabs(src0.x);
 476 float absY = fabs(src0.y);
 477 float absZ = fabs(src0.z);
 478 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
 479 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
 480 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
 481 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
 482 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
 483 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
 484 """)
 485
 486 # Sum of vector components
 487 unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 488
 489 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 490    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 491           False, alg_props, const_expr)
 492
 493 def binop(name, ty, alg_props, const_expr):
 494    binop_convert(name, ty, ty, alg_props, const_expr)
 495
 496 def binop_compare(name, ty, alg_props, const_expr):
 497    binop_convert(name, tbool1, ty, alg_props, const_expr)
 498
 499 def binop_compare16(name, ty, alg_props, const_expr):
 500    binop_convert(name, tbool16, ty, alg_props, const_expr)
 501
 502 def binop_compare32(name, ty, alg_props, const_expr):
 503    binop_convert(name, tbool32, ty, alg_props, const_expr)
 504
 505 def binop_compare_all_sizes(name, ty, alg_props, const_expr):
 506    binop_compare(name, ty, alg_props, const_expr)
 507    binop_compare16(name + "16", ty, alg_props, const_expr)
 508    binop_compare32(name + "32", ty, alg_props, const_expr)
 509
 510 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 511                 src2_type, const_expr):
 512    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 513           False, "", const_expr)
 514
 515 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 516                  reduce_expr, final_expr):
 517    def final(src):
 518       return final_expr.format(src= "(" + src + ")")
 519    def reduce_(src0, src1):
 520       return reduce_expr.format(src0=src0, src1=src1)
 521    def prereduce(src0, src1):
 522       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 523    src0 = prereduce("src0.x", "src1.x")
 524    src1 = prereduce("src0.y", "src1.y")
 525    src2 = prereduce("src0.z", "src1.z")
 526    src3 = prereduce("src0.w", "src1.w")
 527    opcode(name + "2", output_size, output_type,
 528           [2, 2], [src_type, src_type], False, _2src_commutative,
 529           final(reduce_(src0, src1)))
 530    opcode(name + "3", output_size, output_type,
 531           [3, 3], [src_type, src_type], False, _2src_commutative,
 532           final(reduce_(reduce_(src0, src1), src2)))
 533    opcode(name + "4", output_size, output_type,
 534           [4, 4], [src_type, src_type], False, _2src_commutative,
 535           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 536
 537 def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
 538                            reduce_expr, final_expr):
 539    binop_reduce(name, output_size, tbool1, src_type,
 540                 prereduce_expr, reduce_expr, final_expr)
 541    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
 542                 prereduce_expr, reduce_expr, final_expr)
 543    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
 544                 prereduce_expr, reduce_expr, final_expr)
 545
 546 binop("fadd", tfloat, _2src_commutative + associative,"""
 547 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 548    if (bit_size == 64)
 549       dst = _mesa_double_add_rtz(src0, src1);
 550    else
 551       dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
 552 } else {
 553    dst = src0 + src1;
 554 }
 555 """)
 556 binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
 557 binop("iadd_sat", tint, _2src_commutative, """
 558       src1 > 0 ?
 559          (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
 560          (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
 561 """)
 562 binop("uadd_sat", tuint, _2src_commutative,
 563       "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
 564 binop("isub_sat", tint, "", """
 565       src1 < 0 ?
 566          (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
 567          (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
 568 """)
 569 binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")
 570
 571 binop("fsub", tfloat, "", """
 572 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 573    if (bit_size == 64)
 574       dst = _mesa_double_sub_rtz(src0, src1);
 575    else
 576       dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
 577 } else {
 578    dst = src0 - src1;
 579 }
 580 """)
 581 binop("isub", tint, "", "src0 - src1")
 582
 583 binop("fmul", tfloat, _2src_commutative + associative, """
 584 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 585    if (bit_size == 64)
 586       dst = _mesa_double_mul_rtz(src0, src1);
 587    else
 588       dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
 589 } else {
 590    dst = src0 * src1;
 591 }
 592 """)
 593 # low 32-bits of signed/unsigned integer multiply
 594 binop("imul", tint, _2src_commutative + associative, "src0 * src1")
 595
 596 # Generate 64 bit result from 2 32 bits quantity
 597 binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
 598               "(int64_t)src0 * (int64_t)src1")
 599 binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
 600               "(uint64_t)src0 * (uint64_t)src1")
 601
 602 # high 32-bits of signed integer multiply
 603 binop("imul_high", tint, _2src_commutative, """
 604 if (bit_size == 64) {
 605    /* We need to do a full 128-bit x 128-bit multiply in order for the sign
 606     * extension to work properly.  The casts are kind-of annoying but needed
 607     * to prevent compiler warnings.
 608     */
 609    uint32_t src0_u32[4] = {
 610       src0,
 611       (int64_t)src0 >> 32,
 612       (int64_t)src0 >> 63,
 613       (int64_t)src0 >> 63,
 614    };
 615    uint32_t src1_u32[4] = {
 616       src1,
 617       (int64_t)src1 >> 32,
 618       (int64_t)src1 >> 63,
 619       (int64_t)src1 >> 63,
 620    };
 621    uint32_t prod_u32[4];
 622    ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
 623    dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
 624 } else {
 625    dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
 626 }
 627 """)
 628
 629 # high 32-bits of unsigned integer multiply
 630 binop("umul_high", tuint, _2src_commutative, """
 631 if (bit_size == 64) {
 632    /* The casts are kind-of annoying but needed to prevent compiler warnings. */
 633    uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
 634    uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
 635    uint32_t prod_u32[4];
 636    ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
 637    dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
 638 } else {
 639    dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
 640 }
 641 """)
 642
 643 # low 32-bits of unsigned integer multiply
 644 binop("umul_low", tuint32, _2src_commutative, """
 645 uint64_t mask = (1 << (bit_size / 2)) - 1;
 646 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
 647 """)
 648
 649
 650 binop("fdiv", tfloat, "", "src0 / src1")
 651 binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
 652 binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
 653
 654 # returns a boolean representing the carry resulting from the addition of
 655 # the two unsigned arguments.
 656
 657 binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")
 658
 659 # returns a boolean representing the borrow resulting from the subtraction
 660 # of the two unsigned arguments.
 661
 662 binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 663
 664 # hadd: (a + b) >> 1 (without overflow)
 665 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
 666 #       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
 667 #       = 2 *  (x & y) + (x & ~y) +                (~x & y)
 668 #       =     ((x & y) << 1) + (x ^ y)
 669 #
 670 # Since we know that the bottom bit of (x & y) << 1 is zero,
 671 #
 672 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
 673 #              =   (x & y) +      ((x ^ y)  >> 1)
 674 binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 675 binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 676
 677 # rhadd: (a + b + 1) >> 1 (without overflow)
 678 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 679 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 680 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 681 #           =     ((x | y) << 1) - (x ^ y) + 1
 682 #
# Since we know that the bottom bit of (x | y) << 1 is zero,
 684 #
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
 686 #                  = (x | y) -  ((x ^ y)      >> 1)
 687 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 688 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 689
 690 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
 691
 692 # For signed integers, there are several different possible definitions of
 693 # "modulus" or "remainder".  We follow the conventions used by LLVM and
 694 # SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
 695 # operation while the imod opcode implements the more mathematical
 696 # "modulus" operation.  For details on the difference, see
 697 #
 698 # http://mathforum.org/library/drmath/view/52343.html
 699
 700 binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
 701 binop("imod", tint, "",
 702       "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
 703       "                 src0 % src1 : src0 % src1 + src1)")
 704 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 705 binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 706
 707 #
 708 # Comparisons
 709 #
 710
 711
 712 # these integer-aware comparisons return a boolean (0 or ~0)
 713
 714 binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
 715 binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
 716 binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
 717 binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
 718 binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
 719 binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
 720 binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
 721 binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
 722 binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
 723 binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
 724
 725 # integer-aware GLSL-style comparisons that compare floats and ints
 726
 727 binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
 728                        "{src0} && {src1}", "{src}")
 729 binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
 730                        "{src0} || {src1}", "{src}")
 731 binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
 732                        "{src0} && {src1}", "{src}")
 733 binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
 734                        "{src0} || {src1}", "{src}")
 735
 736 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 737
 738 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
 739              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
 740 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
 741              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 742
 743 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 744 # and false respectively
 745
 746 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
 747 binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 748 binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 749 binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 750
 751 # SPIRV shifts are undefined for shift-operands >= bitsize,
 752 # but SM5 shifts are defined to use the least significant bits, only
 753 # The NIR definition is according to the SM5 specification.
 754 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
 755        "src0 << (src1 & (sizeof(src0) * 8 - 1))")
 756 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
 757        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 758 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
 759        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 760
 761 opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 762    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 763    dst = (src0 << (src1 & rotate_mask)) |
 764          (src0 >> (-src1 & rotate_mask));
 765 """)
 766 opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 767    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 768    dst = (src0 >> (src1 & rotate_mask)) |
 769          (src0 << (-src1 & rotate_mask));
 770 """)
 771
 772 # bitwise logic operators
 773 #
 774 # These are also used as boolean and, or, xor for hardware supporting
 775 # integers.
 776
 777
 778 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
 779 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
 780 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 781
 782
 783 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 784              "{src}")
 785
 786 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 787              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 788
 789 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
 790        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 791 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
 792        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 793
 794 binop("fmin", tfloat, "", "fmin(src0, src1)")
 795 binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 796 binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 797 binop("fmax", tfloat, "", "fmax(src0, src1)")
 798 binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 799 binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 800
 801 # Saturated vector add for 4 8bit ints.
 802 binop("usadd_4x8", tint32, _2src_commutative + associative, """
 803 dst = 0;
 804 for (int i = 0; i < 32; i += 8) {
 805    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 806 }
 807 """)
 808
 809 # Saturated vector subtract for 4 8bit ints.
 810 binop("ussub_4x8", tint32, "", """
 811 dst = 0;
 812 for (int i = 0; i < 32; i += 8) {
 813    int src0_chan = (src0 >> i) & 0xff;
 814    int src1_chan = (src1 >> i) & 0xff;
 815    if (src0_chan > src1_chan)
 816       dst |= (src0_chan - src1_chan) << i;
 817 }
 818 """)
 819
 820 # vector min for 4 8bit ints.
 821 binop("umin_4x8", tint32, _2src_commutative + associative, """
 822 dst = 0;
 823 for (int i = 0; i < 32; i += 8) {
 824    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 825 }
 826 """)
 827
 828 # vector max for 4 8bit ints.
 829 binop("umax_4x8", tint32, _2src_commutative + associative, """
 830 dst = 0;
 831 for (int i = 0; i < 32; i += 8) {
 832    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 833 }
 834 """)
 835
 836 # unorm multiply: (a * b) / 255.
 837 binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
 838 dst = 0;
 839 for (int i = 0; i < 32; i += 8) {
 840    int src0_chan = (src0 >> i) & 0xff;
 841    int src1_chan = (src1 >> i) & 0xff;
 842    dst |= ((src0_chan * src1_chan) / 255) << i;
 843 }
 844 """)
 845
 846 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 847
 848 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 849             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 850
 851 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 852               "src0 | ((uint64_t)src1 << 32)")
 853
 854 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
 855               "src0 | ((uint32_t)src1 << 16)")
 856
 857 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 858 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
 859 # are from the low five bits of src0 and src1, respectively.
 860 binop_convert("bfm", tuint32, tint32, "", """
 861 int bits = src0 & 0x1F;
 862 int offset = src1 & 0x1F;
 863 dst = ((1u << bits) - 1) << offset;
 864 """)
 865
 866 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
 867 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 868 /* flush denormals to zero. */
 869 if (!isnormal(dst))
 870    dst = copysignf(0.0f, src0);
 871 """)
 872
 873 # Combines the first component of each input to make a 2-component vector.
 874
 875 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 876 dst.x = src0.x;
 877 dst.y = src1.x;
 878 """)
 879
 880 # Byte extraction
 881 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 882 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 883
 884 # Word extraction
 885 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 886 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 887
 888
 889 def triop(name, ty, alg_props, const_expr):
 890    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 891 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 892    opcode(name, output_size, tuint,
 893    [src1_size, src2_size, src3_size],
 894    [tuint, tuint, tuint], False, "", const_expr)
 895
 896 triop("ffma", tfloat, _2src_commutative, """
 897 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 898    if (bit_size == 64)
 899       dst = _mesa_double_fma_rtz(src0, src1, src2);
 900    else if (bit_size == 32)
 901       dst = _mesa_float_fma_rtz(src0, src1, src2);
 902    else
 903       dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
 904 } else {
 905    if (bit_size == 32)
 906       dst = fmaf(src0, src1, src2);
 907    else
 908       dst = fma(src0, src1, src2);
 909 }
 910 """)
 911
 912 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 913
 914 # Conditional Select
 915 #
 916 # A vector conditional select instruction (like ?:, but operating per-
 917 # component on vectors). There are two versions, one for floating point
 918 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 919
 920
 921 triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 922
 923 # 3 way min/max/med
 924 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 925 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 926 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 927
 928 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 929 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 930 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 931
 932 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 933 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 934 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 935
 936 opcode("bcsel", 0, tuint, [0, 0, 0],
 937       [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
 938 opcode("b16csel", 0, tuint, [0, 0, 0],
 939        [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
 940 opcode("b32csel", 0, tuint, [0, 0, 0],
 941        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 942
 943 # SM5 bfi assembly
 944 triop("bfi", tuint32, "", """
 945 unsigned mask = src0, insert = src1, base = src2;
 946 if (mask == 0) {
 947    dst = base;
 948 } else {
 949    unsigned tmp = mask;
 950    while (!(tmp & 1)) {
 951       tmp >>= 1;
 952       insert <<= 1;
 953    }
 954    dst = (base & ~mask) | (insert & mask);
 955 }
 956 """)
 957
 958
 959 triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 960
 961 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
 962 opcode("ubfe", 0, tuint32,
 963        [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
 964 unsigned base = src0;
 965 unsigned offset = src1 & 0x1F;
 966 unsigned bits = src2 & 0x1F;
 967 if (bits == 0) {
 968    dst = 0;
 969 } else if (offset + bits < 32) {
 970    dst = (base << (32 - bits - offset)) >> (32 - bits);
 971 } else {
 972    dst = base >> offset;
 973 }
 974 """)
 975 opcode("ibfe", 0, tint32,
 976        [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
 977 int base = src0;
 978 unsigned offset = src1 & 0x1F;
 979 unsigned bits = src2 & 0x1F;
 980 if (bits == 0) {
 981    dst = 0;
 982 } else if (offset + bits < 32) {
 983    dst = (base << (32 - bits - offset)) >> (32 - bits);
 984 } else {
 985    dst = base >> offset;
 986 }
 987 """)
 988
 989 # GLSL bitfieldExtract()
 990 opcode("ubitfield_extract", 0, tuint32,
 991        [0, 0, 0], [tuint32, tint32, tint32], False, "", """
 992 unsigned base = src0;
 993 int offset = src1, bits = src2;
 994 if (bits == 0) {
 995    dst = 0;
 996 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
 997    dst = 0; /* undefined per the spec */
 998 } else {
 999    dst = (base >> offset) & ((1ull << bits) - 1);
1000 }
1001 """)
# GLSL bitfieldExtract(), signed form.  A zero width, a negative offset or
# width, or a field that extends past bit 32 all yield 0.  Otherwise the
# field is shifted up so its top bit lands in bit 31 and then shifted back
# down by (32 - bits), which places it at bit 0 and sign-extends it.  The
# down-shift used to be by `offset`, which left the field at the wrong
# position (e.g. offset 0 never moved it down at all).
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1014
# Builds a 3-component vector from the first component of each input.
triop_horiz(
   "vec3", 3, 1, 1, 1,
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1022
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-source tuint opcode with explicit component sizes.

   Forwards to opcode() with `output_size`, the four source sizes in
   order, tuint for the result and every source, False as the sixth
   argument and no algebraic properties.
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   src_types = [tuint] * len(src_sizes)
   opcode(name, output_size, tuint, src_sizes, src_types, False, "",
          const_expr)
1029
# Bitfield insert: writes the low `bits` bits of src1 into src0 starting at
# bit `offset`.  A zero width returns the base unchanged; a negative offset
# or width, or a field extending past bit 32, yields 0.
opcode(
   "bitfield_insert", 0, tuint32,
   [0, 0, 0, 0], [tuint32, tuint32, tint32, tint32],
   False, "",
   """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1043
# Builds a 4-component vector from the first component of each input.
quadop_horiz(
   "vec4", 4, 1, 1, 1, 1,
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
1050
# An integer multiply instruction for address calculation.  It is similar
# to imul, except that the result is undefined in case of overflow, where
# overflow is defined according to the size of the variable being
# dereferenced.
#
# Compared to imul, this relaxed definition lets an optimization pass
# propagate bounds (i.e., from a load/store intrinsic) to the sources, so
# that lower-precision integer multiplies can be used.  That is useful on
# hardware that has 24-bit or perhaps 16-bit integer multiply instructions.
binop("amul", tint, _2src_commutative + associative,
      "src0 * src1")
1062
# ir3-specific instruction that maps directly to mul-add shift high mix
# (IMADSH_MIX16, i.e. ah * bl << 16 + c).  It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
opcode(
   "imadsh_mix16", 1, tint32,
   [1, 1, 1], [tint32] * 3,
   False, "",
   """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")
1070
# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int.  Each
# multiplicand is sign-extended from 24 bits by shifting left then right
# by 8.
triop(
   "imad24_ir3", tint32, _2src_commutative,
   "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension)
binop(
   "imul24", tint32, _2src_commutative + associative,
   "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")