src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
  93 # helper variables for strings
  94 tfloat = "float"
  95 tint = "int"
  96 tbool = "bool"
  97 tbool1 = "bool1"
  98 tbool32 = "bool32"
  99 tuint = "uint"
 100 tuint16 = "uint16"
 101 tfloat32 = "float32"
 102 tint32 = "int32"
 103 tuint32 = "uint32"
 104 tint64 = "int64"
 105 tuint64 = "uint64"
 106 tfloat64 = "float64"
 107
 108 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 109
 110 def type_has_size(type_):
 111     m = _TYPE_SPLIT_RE.match(type_)
 112     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 113     return m.group('bits') is not None
 114
 115 def type_size(type_):
 116     m = _TYPE_SPLIT_RE.match(type_)
 117     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 118     assert m.group('bits') is not None, \
 119            'NIR type string has no bit size: "{}"'.format(type_)
 120     return int(m.group('bits'))
 121
 122 def type_sizes(type_):
 123     if type_has_size(type_):
 124         return [type_size(type_)]
 125     elif type_ == 'bool':
 126         return [1, 32]
 127     elif type_ == 'float':
 128         return [16, 32, 64]
 129     else:
 130         return [1, 8, 16, 32, 64]
 131
 132 def type_base_type(type_):
 133     m = _TYPE_SPLIT_RE.match(type_)
 134     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 135     return m.group('type')
 136
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
_2src_commutative = "2src_commutative "
# Flag for associative operations.  Like the flag above it ends in a space,
# so flags can be combined with "+" into the space-separated
# algebraic_properties string.
associative = "associative "

# global dictionary of opcodes, keyed by opcode name and filled in by
# opcode() below
opcodes = {}
 147
 148 def opcode(name, output_size, output_type, input_sizes, input_types,
 149            is_conversion, algebraic_properties, const_expr):
 150    assert name not in opcodes
 151    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 152                           input_types, is_conversion, algebraic_properties,
 153                           const_expr)
 154
 155 def unop_convert(name, out_type, in_type, const_expr):
 156    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 157
 158 def unop(name, ty, const_expr):
 159    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 160
 161 def unop_horiz(name, output_size, output_type, input_size, input_type,
 162                const_expr):
 163    opcode(name, output_size, output_type, [input_size], [input_type],
 164           False, "", const_expr)
 165
 166 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 167                 reduce_expr, final_expr):
 168    def prereduce(src):
 169       return "(" + prereduce_expr.format(src=src) + ")"
 170    def final(src):
 171       return final_expr.format(src="(" + src + ")")
 172    def reduce_(src0, src1):
 173       return reduce_expr.format(src0=src0, src1=src1)
 174    src0 = prereduce("src0.x")
 175    src1 = prereduce("src0.y")
 176    src2 = prereduce("src0.z")
 177    src3 = prereduce("src0.w")
 178    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 179               final(reduce_(src0, src1)))
 180    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 181               final(reduce_(reduce_(src0, src1), src2)))
 182    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 183               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 184
 185 def unop_numeric_convert(name, out_type, in_type, const_expr):
 186    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 187
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Negation and logical/bitwise not.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 when the source is zero and 0.0 otherwise.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# Sign: 0 for a zero source, otherwise 1 or -1.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# Saturate: clamp the source to the range [0, 1].
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root pick the double or
# float C function/constants depending on bit_size.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): fexp2/flog2 use the single-precision C functions for every
# bit size, unlike fsqrt above -- confirm this is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 212
 213 # Generate all of the numeric conversion opcodes
 214 for src_t in [tint, tuint, tfloat, tbool]:
 215    if src_t == tbool:
 216       dst_types = [tfloat, tint]
 217    elif src_t == tint:
 218       dst_types = [tfloat, tint, tbool]
 219    elif src_t == tuint:
 220       dst_types = [tfloat, tuint]
 221    elif src_t == tfloat:
 222       dst_types = [tint, tuint, tfloat, tbool]
 223
 224    for dst_t in dst_types:
 225       for bit_size in type_sizes(dst_t):
 226           if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
 227               rnd_modes = ['_rtne', '_rtz', '']
 228               for rnd_mode in rnd_modes:
 229                   unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
 230                                                               bit_size, rnd_mode),
 231                                        dst_t + str(bit_size), src_t, "src0")
 232           else:
 233               conv_expr = "src0 != 0" if dst_t == tbool else "src0"
 234               unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
 235                                    dst_t + str(bit_size), src_t, conv_expr)
 236
 237
# Unary floating-point rounding operations.
#
# Each one picks the double or float C function depending on bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Round-trips the source through half precision.  Sources whose magnitude is
# below 2^-14 instead become a zero that carries the source's sign.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
#
# frexp_exp has C frexp() write the exponent straight into dst; frexp_sig
# keeps frexp()'s return value and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 268
 269
 270 # Floating point pack and unpack operations.
 271
 272 def pack_2x16(fmt):
 273    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 274 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 275 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 276 """.replace("fmt", fmt))
 277
 278 def pack_4x8(fmt):
 279    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 280 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 281 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 282 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 283 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 284 """.replace("fmt", fmt))
 285
 286 def unpack_2x16(fmt):
 287    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 288 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 289 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 290 """.replace("fmt", fmt))
 291
 292 def unpack_4x8(fmt):
 293    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 294 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 295 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 296 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 297 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 298 """.replace("fmt", fmt))
 299
 300
# Instantiate the float pack/unpack opcodes for each format.  The "half"
# format only gets the 2x16 variants.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Integer packing: src0.x is masked to its low 16 bits and src0.y is shifted
# into the high 16 bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# NOTE(review): unlike pack_uvec2_to_uint, the components are not masked
# before being shifted and or'ed together -- confirm that is intended.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Bit-exact packing of narrower unsigned components into a wider value; the
# first component lands in the lowest bits.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# The reverse: split a 64-bit value into two 32-bit components, lowest bits
# first.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
 334
 335 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 336            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 337
# Split a 32-bit value into two 16-bit components, lowest bits first.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.
#
# The _split_x variants produce the low half of the source and the _split_y
# variants the high half.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 354
# Bit operations, part of ARB_gpu_shader5.


# Reverses the order of the 32 bits of the source.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the most significant set bit, or -1 if no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant (32-bit only): for negative sources the most significant
# *clear* bit is reported instead.  -1 if no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 if no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 405
 406
# Registers fnoise<i>_<j> for every output size i and input size j from 1
# to 4.  The constant expression is 0.0f for all of them.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 410
 411
# AMD_gcn_shader extended instructions

# cube_face_coord: picks the component of the source with the largest
# magnitude as the major axis (ma is twice that component), selects the two
# remaining components with face-dependent signs, then divides them by ma
# and adds 0.5.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# cube_face_index: 0/1 for +X/-X, 2/3 for +Y/-Y and 4/5 for +Z/-Z, chosen
# by the component with the largest magnitude.  The checks run in order, so
# on ties the last matching one wins.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
 446
 447
 448 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 449    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 450           False, alg_props, const_expr)
 451
 452 def binop(name, ty, alg_props, const_expr):
 453    binop_convert(name, ty, ty, alg_props, const_expr)
 454
 455 def binop_compare(name, ty, alg_props, const_expr):
 456    binop_convert(name, tbool1, ty, alg_props, const_expr)
 457
 458 def binop_compare32(name, ty, alg_props, const_expr):
 459    binop_convert(name, tbool32, ty, alg_props, const_expr)
 460
 461 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 462                 src2_type, const_expr):
 463    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 464           False, "", const_expr)
 465
 466 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 467                  reduce_expr, final_expr):
 468    def final(src):
 469       return final_expr.format(src= "(" + src + ")")
 470    def reduce_(src0, src1):
 471       return reduce_expr.format(src0=src0, src1=src1)
 472    def prereduce(src0, src1):
 473       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 474    src0 = prereduce("src0.x", "src1.x")
 475    src1 = prereduce("src0.y", "src1.y")
 476    src2 = prereduce("src0.z", "src1.z")
 477    src3 = prereduce("src0.w", "src1.w")
 478    opcode(name + "2", output_size, output_type,
 479           [2, 2], [src_type, src_type], False, _2src_commutative,
 480           final(reduce_(src0, src1)))
 481    opcode(name + "3", output_size, output_type,
 482           [3, 3], [src_type, src_type], False, _2src_commutative,
 483           final(reduce_(reduce_(src0, src1), src2)))
 484    opcode(name + "4", output_size, output_type,
 485           [4, 4], [src_type, src_type], False, _2src_commutative,
 486           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 487
binop("fadd", tfloat, _2src_commutative + associative, "src0 + src1")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add: when the sum wraps, the result is the largest
# (src1 > 0) or smallest (otherwise) value for the bit size.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Saturating unsigned add: a wrapped sum is replaced by the maximum value.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract, mirroring iadd_sat.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Saturating unsigned subtract: clamps at 0 instead of wrapping.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, _2src_commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate a 64-bit result from two 32-bit quantities
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")
 516
# high bits of signed integer multiply
#
# For 64-bit sources the product is computed on arrays of 32-bit words and
# the upper two of the four result words are returned; for narrower sources
# the product is computed in 64 bits and shifted right by bit_size.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high bits of unsigned integer multiply
#
# Same structure as imul_high, but the 64-bit sources are only split into
# two 32-bit words each because no sign extension is needed.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")
 557
# Integer division by zero is folded to 0 rather than left to C.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 584
 585 # rhadd: (a + b + 1) >> 1 (without overflow)
 586 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 587 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 588 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 589 #           =     ((x | y) << 1) - (x ^ y) + 1
 590 #
 591 # Since we know that the bottom bit of (x & y) << 1 is zero,
 592 #
 593 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 594 #                  = (x | y) -  ((x ^ y)      >> 1)
 595 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 596 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 597
# As with division, a zero divisor is folded to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder whenever that remainder is non-zero and
# the operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# NOTE(review): fmod/frem call the single-precision floorf/truncf for every
# bit size, unlike ffloor/ftrunc which switch on bit_size -- confirm this
# is acceptable for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 614
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

# Variants producing a bool1 result.
binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare("fne", tfloat, _2src_commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare("ine", tint, _2src_commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
# The same comparisons producing a bool32 result.
binop_compare32("flt32", tfloat, "", "src0 < src1")
binop_compare32("fge32", tfloat, "", "src0 >= src1")
binop_compare32("feq32", tfloat, _2src_commutative, "src0 == src1")
binop_compare32("fne32", tfloat, _2src_commutative, "src0 != src1")
binop_compare32("ilt32", tint, "", "src0 < src1")
binop_compare32("ige32", tint, "", "src0 >= src1")
binop_compare32("ieq32", tint, _2src_commutative, "src0 == src1")
binop_compare32("ine32", tint, _2src_commutative, "src0 != src1")
binop_compare32("ult32", tuint, "", "src0 < src1")
binop_compare32("uge32", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# "all equal" and's the per-component equality tests together; "any not
# equal" or's the per-component inequality tests together.

binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# The same reductions producing a bool32 result.
binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 669
# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with the unsized tfloat while slt, seq and
# sne use tfloat32 -- confirm whether the difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal

# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a 32-bit unsigned value and is masked to the
# bit width of src0 before shifting.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, _2src_commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, _2src_commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, _2src_commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: per-component products summed together.  The _replicated
# variant is registered with an output size of 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: dot product of a 3-component source with the first three components
# of a 4-component source, plus that source's w component.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# NOTE(review): fmin/fmax use the single-precision fminf/fmaxf for every bit
# size -- confirm this is acceptable for 64-bit sources.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 728
# Saturated vector add for 4 8bit ints.
#
# Each of the four 8-bit lanes packed in the 32-bit sources is added
# separately and clamped to 0xff with MIN2 before being OR-ed into its place
# in dst.
binop("usadd_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
#
# A lane only contributes to dst when src0's lane is greater than src1's;
# otherwise that lane keeps the 0 that dst was initialised to.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")
 747
# Vector min for 4 8bit ints: MIN2 is applied to each pair of 8-bit lanes
# and the result is OR-ed back into the same lane of dst.
binop("umin_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# Vector max for 4 8bit ints: same lane-by-lane loop as umin_4x8, using MAX2.
binop("umax_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")
 763
# unorm multiply: (a * b) / 255.
#
# Applied to each of the four 8-bit lanes separately; the division is the C
# integer division on int operands, so the quotient is truncated.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 773
 774 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 775
# Packs two single-component 32-bit float sources into one 32-bit value:
# pack_half_1x16(src0.x) supplies the low 16 bits and pack_half_1x16(src1.x)
# the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Combines two 32-bit values into one 64-bit value: src0 is the low half and
# src1 the high half.  src1 is widened to uint64_t before the shift so the
# upper bits are not lost.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# Combines two 16-bit values into one 32-bit value: src0 is the low half and
# src1, widened to uint32_t before the shift, the high half.
binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
 784
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments is 32.
#
# src0 is the bit count and src1 the offset.  The constant expression yields
# 0 for the undefined cases (a negative argument, an argument above 31, or
# offset + bits above 32); otherwise it builds a mask of `bits` ones shifted
# left by `offset`.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")
 795
# ldexp: applies the C ldexp() (64-bit operands) or ldexpf() (other sizes)
# to the float src0 and the 32-bit integer exponent src1.
#
# NOTE(review): the flush below triggers whenever the result is not a normal
# number.  Per C's isnormal() that covers zero, infinities and NaN as well as
# denormals, all of which are replaced by a zero carrying src0's sign --
# confirm that is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
 802
# Combines the first component of each input to make a 2-component vector.
# Both sources are declared with size 1 and, like the output, typed tuint.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")
 809
# Byte extraction
#
# src1 selects the byte: src0 is shifted right by src1 * 8 bits and the
# result is cast to an 8-bit type.  The u8 variant casts to uint8_t and the
# i8 variant to int8_t, so the latter keeps the byte's sign.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
#
# Same scheme with 16-bit words: shift right by src1 * 16 bits, then cast to
# uint16_t (u16) or int16_t (i16).
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 817
 818
 819 def triop(name, ty, alg_props, const_expr):
 820    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 821 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 822    opcode(name, output_size, tuint,
 823    [src1_size, src2_size, src3_size],
 824    [tuint, tuint, tuint], False, "", const_expr)
 825
# Multiply-add: src0 * src1 + src2.  Flagged as 2-source commutative.
triop("ffma", tfloat, _2src_commutative, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with src2 as the weight:
# a weight of 0 gives src0 and a weight of 1 gives src1.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 829
# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
#
# fcsel below is the floating point version: it yields src1 when
# src0 != 0.0 and src2 otherwise.  The integer-bool versions (bcsel and
# b32csel) are declared after the 3-way min/max/med opcodes.


triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 838
 839 # 3 way min/max/med
 840 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 841 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 842 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 843
 844 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 845 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 846 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 847
 848 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 849 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 850 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 851
# Integer-bool conditional selects: yield src1 when the condition src0 is
# non-zero and src2 otherwise.  bcsel takes a tbool1 condition and b32csel a
# tbool32 one; the selected sources and the result are typed tuint.
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 856
# SM5 bfi assembly
#
# src0 is the mask, src1 the value to insert and src2 the base.  A zero mask
# returns the base unchanged; handling it up front also keeps the loop below
# from spinning forever, since it only stops once a set bit reaches bit 0.
# Otherwise insert is shifted left once for every trailing zero bit of the
# mask, and the result takes the mask-selected bits from the shifted insert
# and all remaining bits from base.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")
 871
# SM5 ubfe/ibfe assembly
#
# Bitfield extract of `bits` bits (src2) starting at bit `offset` (src1) of
# src0.  Both constant expressions follow the same steps:
#   - bits == 0 gives 0;
#   - a negative bits or offset gives 0 (marked undefined);
#   - when offset + bits < 32, base is shifted left so the field's top bit
#     lands in bit 31 and then right by (32 - bits) to right-align it;
#   - otherwise the result is simply base >> offset.
# ubfe works on an unsigned base, so the right shift fills with zeros.  ibfe
# works on an int base; its right shift is what sign-extends the field
# (assuming the C compiler implements >> on negative ints as an arithmetic
# shift).
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
 901
# GLSL bitfieldExtract()
#
# Unsigned variant: extracts `bits` bits (src2) starting at bit `offset`
# (src1) of src0.  bits == 0 gives 0, and a negative argument or a field that
# would end beyond bit 32 also gives 0 (undefined per the spec).  Otherwise
# base is shifted right by offset and masked to its low `bits` bits; the mask
# is built from a 64-bit one (1ull) so that bits == 32 stays in range.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
 915 opcode("ibitfield_extract", 0, tint32,
 916        [0, 0, 0], [tint32, tint32, tint32], False, "", """
 917 int base = src0;
 918 int offset = src1, bits = src2;
 919 if (bits == 0) {
 920    dst = 0;
 921 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 922    dst = 0;
 923 } else {
 924    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 925 }
 926 """)
 927
# Combines the first component of each input to make a 3-component vector.
# All three sources are declared with size 1; triop_horiz types them and the
# output as tuint.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
 935
 936 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 937                  src4_size, const_expr):
 938    opcode(name, output_size, tuint,
 939           [src1_size, src2_size, src3_size, src4_size],
 940           [tuint, tuint, tuint, tuint],
 941           False, "", const_expr)
 942
# Bitfield insert: replaces `bits` bits (src3) of the base (src0), starting at
# bit `offset` (src2), with the low bits of insert (src1).  bits == 0 returns
# the base unchanged; a negative argument or a field that would end beyond
# bit 32 gives 0.  The mask is built from a 64-bit one (1ull) so that
# bits == 32 stays in range.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
 956
# Combines the first component of each input to make a 4-component vector.
# All four sources are declared with size 1; quadop_horiz types them and the
# output as tuint.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
 963
 964