src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  43         prepended before each entry
  44       - const_expr is an expression or series of statements that computes the
  45         constant value of the opcode given the constant values of its inputs.
  46
  47       Constant expressions are formed from the variables src0, src1, ...,
  48       src(N-1), where N is the number of arguments.  The output of the
  49       expression should be stored in the dst variable.  Per-component input
  50       and output variables will be scalars and non-per-component input and
  51       output variables will be a struct with fields named x, y, z, and w
  52       all of the correct type.  Input and output variables can be assumed
  53       to already be of the correct type and need no conversion.  In
  54       particular, the conversion from the C bool type to/from  NIR_TRUE and
  55       NIR_FALSE happens automatically.
  56
  57       For per-component instructions, the entire expression will be
  58       executed once for each component.  For non-per-component
  59       instructions, the expression is expected to store the correct values
  60       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  61       constant expression, an assignment to dst will happen automatically
  62       and the result will be equivalent to "dst = <expression>" for
  63       per-component instructions and "dst.x = dst.y = ... = <expression>"
  64       for non-per-component instructions.
  65       """
  66       assert isinstance(name, str)
  67       assert isinstance(output_size, int)
  68       assert isinstance(output_type, str)
  69       assert isinstance(input_sizes, list)
  70       assert isinstance(input_sizes[0], int)
  71       assert isinstance(input_types, list)
  72       assert isinstance(input_types[0], str)
  73       assert isinstance(algebraic_properties, str)
  74       assert isinstance(const_expr, str)
  75       assert len(input_sizes) == len(input_types)
  76       assert 0 <= output_size <= 4
  77       for size in input_sizes:
  78          assert 0 <= size <= 4
  79          if output_size != 0:
  80             assert size != 0
  81       self.name = name
  82       self.num_inputs = len(input_sizes)
  83       self.output_size = output_size
  84       self.output_type = output_type
  85       self.input_sizes = input_sizes
  86       self.input_types = input_types
  87       self.algebraic_properties = algebraic_properties
  88       self.const_expr = const_expr
  89
# helper variables for strings
#
# Each name is bound to the NIR type string that the opcode declarations
# below pass as output/input types.  The names without a numeric suffix are
# the unsized forms; the suffixed ones carry an explicit bit size.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
 103
 104 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 105
 106 def type_has_size(type_):
 107     m = _TYPE_SPLIT_RE.match(type_)
 108     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 109     return m.group('bits') is not None
 110
 111 def type_size(type_):
 112     m = _TYPE_SPLIT_RE.match(type_)
 113     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 114     assert m.group('bits') is not None, \
 115            'NIR type string has no bit size: "{}"'.format(type_)
 116     return int(m.group('bits'))
 117
 118 def type_sizes(type_):
 119     if type_has_size(type_):
 120         return [type_size(type_)]
 121     elif type_ == 'bool':
 122         return [32]
 123     elif type_ == 'float':
 124         return [16, 32, 64]
 125     else:
 126         return [8, 16, 32, 64]
 127
 128 def type_base_type(type_):
 129     m = _TYPE_SPLIT_RE.match(type_)
 130     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 131     return m.group('type')
 132
# Algebraic property flags.  Each string ends in a space so that flags can be
# joined with "+" into a single space-separated algebraic_properties string.
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
# (maps an opcode name to its Opcode object; filled in by opcode() below)
opcodes = {}
 138
 139 def opcode(name, output_size, output_type, input_sizes, input_types,
 140            algebraic_properties, const_expr):
 141    assert name not in opcodes
 142    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 143                           input_types, algebraic_properties, const_expr)
 144
 145 def unop_convert(name, out_type, in_type, const_expr):
 146    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
 147
 148 def unop(name, ty, const_expr):
 149    opcode(name, 0, ty, [0], [ty], "", const_expr)
 150
 151 def unop_horiz(name, output_size, output_type, input_size, input_type,
 152                const_expr):
 153    opcode(name, output_size, output_type, [input_size], [input_type], "",
 154           const_expr)
 155
 156 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 157                 reduce_expr, final_expr):
 158    def prereduce(src):
 159       return "(" + prereduce_expr.format(src=src) + ")"
 160    def final(src):
 161       return final_expr.format(src="(" + src + ")")
 162    def reduce_(src0, src1):
 163       return reduce_expr.format(src0=src0, src1=src1)
 164    src0 = prereduce("src0.x")
 165    src1 = prereduce("src0.y")
 166    src2 = prereduce("src0.z")
 167    src3 = prereduce("src0.w")
 168    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 169               final(reduce_(src0, src1)))
 170    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 171               final(reduce_(reduce_(src0, src1), src2)))
 172    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 173               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 174
 175
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Basic unary arithmetic.  The float expressions that test "bit_size == 64"
# use double-precision constants/functions in the 64-bit arm and the
# single-precision ("f"-suffixed) forms in the other arm.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot: 1.0 when the source equals zero, 0.0 otherwise
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# fsign / isign: 0 for a zero source, 1 for a positive one, otherwise -1
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat: sources above 1.0 become 1.0, sources at or below 0.0 become 0.0
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): unlike fsqrt above, these two always call the single-precision
# exp2f()/log2f() whatever bit_size is -- confirm that is intended for 64-bit.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 200
 201 # Generate all of the numeric conversion opcodes
 202 for src_t in [tint, tuint, tfloat, tbool]:
 203    if src_t == tbool:
 204       dst_types = [tfloat, tint]
 205    elif src_t == tint:
 206       dst_types = [tfloat, tint, tbool]
 207    elif src_t == tuint:
 208       dst_types = [tfloat, tuint]
 209    elif src_t == tfloat:
 210       dst_types = [tint, tuint, tfloat, tbool]
 211
 212    for dst_t in dst_types:
 213       for bit_size in type_sizes(dst_t):
 214           if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
 215               rnd_modes = ['_rtne', '_rtz', '']
 216               for rnd_mode in rnd_modes:
 217                   unop_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
 218                                                        bit_size, rnd_mode),
 219                                dst_t + str(bit_size), src_t, "src0")
 220           else:
 221               conv_expr = "src0 != 0" if dst_t == tbool else "src0"
 222               unop_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
 223                            dst_t + str(bit_size), src_t, conv_expr)
 224
 225
# Unary floating-point rounding operations.
#
# Each expression picks the double-precision C function for 64-bit operands
# and the "f"-suffixed single-precision one otherwise.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Sources whose magnitude is below ldexpf(1.0, -14) become a zero carrying the
# source's sign; everything else is converted to half precision and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp keeps the exponent that frexp() writes through its pointer
# argument; frexp_sig keeps frexp()'s return value and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat64, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat64, tfloat64, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 256
 257
 258 # Floating point pack and unpack operations.
 259
 260 def pack_2x16(fmt):
 261    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 262 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 263 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 264 """.replace("fmt", fmt))
 265
 266 def pack_4x8(fmt):
 267    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 268 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 269 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 270 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 271 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 272 """.replace("fmt", fmt))
 273
 274 def unpack_2x16(fmt):
 275    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 276 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 277 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 278 """.replace("fmt", fmt))
 279
 280 def unpack_4x8(fmt):
 281    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 282 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 283 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 284 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 285 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 286 """.replace("fmt", fmt))
 287
 288
# Instantiate the pack/unpack opcodes for the snorm, unorm and half formats.
# Only the 2x16 layout is declared for "half".
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Low 16 bits of x in the low half, y shifted into the high half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# The components are shifted by 0/8/16/24 and OR-ed together; note that they
# are not masked to 8 bits first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Bit-exact packing of narrower integers into a wider one; the casts widen
# each component before it is shifted into place.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Splits a 64-bit value into its low (x) and high (y) 32-bit halves.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
 322
 323 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 324            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 325
# Splits a 32-bit value into its low (x) and high (y) 16-bit halves.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.
#
# The *_split_x variants read the low half of the source and the *_split_y
# variants read the high half.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 342
# Bit operations, part of ARB_gpu_shader5.


# Moves source bit N to result bit (31 - N).
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the highest set bit, or -1 when the source is zero.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant: stays -1 when no bit differs from the sign (0 and -1).
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the lowest set bit, or -1 when the source is zero.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 393
 394
# Declares fnoise<i>_<j> for every output size i and input size j in 1..4.
# The constant expression of each of them is simply 0.0f.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 398
 399
# AMD_gcn_shader extended instructions
#
# Both opcodes pick the axis with the largest magnitude (ties are resolved by
# the last matching "if", since later assignments overwrite earlier ones) and
# branch on the sign of that axis.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
""")

# Face numbering used below: +X = 0, -X = 1, +Y = 2, -Y = 3, +Z = 4, -Z = 5.
# NOTE(review): dst.x is not given a default value here, unlike in
# cube_face_coord -- confirm what should happen if no condition matches.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
 425
 426
 427 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 428    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 429
 430 def binop(name, ty, alg_props, const_expr):
 431    binop_convert(name, ty, ty, alg_props, const_expr)
 432
 433 def binop_compare(name, ty, alg_props, const_expr):
 434    binop_convert(name, tbool32, ty, alg_props, const_expr)
 435
 436 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 437                 src2_type, const_expr):
 438    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 439           "", const_expr)
 440
 441 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 442                  reduce_expr, final_expr):
 443    def final(src):
 444       return final_expr.format(src= "(" + src + ")")
 445    def reduce_(src0, src1):
 446       return reduce_expr.format(src0=src0, src1=src1)
 447    def prereduce(src0, src1):
 448       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 449    src0 = prereduce("src0.x", "src1.x")
 450    src1 = prereduce("src0.y", "src1.y")
 451    src2 = prereduce("src0.z", "src1.z")
 452    src3 = prereduce("src0.w", "src1.w")
 453    opcode(name + "2", output_size, output_type,
 454           [2, 2], [src_type, src_type], commutative,
 455           final(reduce_(src0, src1)))
 456    opcode(name + "3", output_size, output_type,
 457           [3, 3], [src_type, src_type], commutative,
 458           final(reduce_(reduce_(src0, src1), src2)))
 459    opcode(name + "4", output_size, output_type,
 460           [4, 4], [src_type, src_type], commutative,
 461           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 462
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
# Saturating unsigned add: a wrapped sum (smaller than src0) yields UINT64_MAX.
binop("uadd_sat", tuint, commutative,
      "(src0 + src1) < src0 ? UINT64_MAX : (src0 + src1)")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
 473
# high 32-bits of signed integer multiply
#
# For bit sizes other than 64 the product is computed in 64 bits and shifted
# right by bit_size.  The 64-bit case multiplies arrays of 32-bit words with
# ubm_mul_u32arr() and takes words 2 and 3 of the product.
binop("imul_high", tint, commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
#
# Same scheme as imul_high, but with two-word (zero-extended) operand arrays.
binop("umul_high", tuint, commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")
 514
# The integer divisions, and the integer modulus/remainder opcodes further
# down, fold to 0 when the divisor is 0 rather than dividing by zero.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 back onto the C remainder when the remainder is non-zero and
# the operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# NOTE(review): both expressions always use the single-precision
# floorf()/truncf() whatever bit_size is -- confirm for 64-bit operands.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 545
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# "ball" variants AND the per-component results together; "bany" variants OR
# them.

binop_reduce("ball_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with the unsized tfloat while slt/seq/sne use
# tfloat32 -- confirm whether that difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal


# Shifts.  The value being shifted is unsized, but the shift count is always
# declared as uint32.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")
 594
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: per-component products summed up.  The "_replicated"
# variants are declared with an output size of 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: three-component dot product with src1.w added on top.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Note that the float min/max are declared without algebraic properties,
# while the integer ones are commutative and associative.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 635
# The *_4x8 opcodes below treat each 32-bit operand as four 8-bit lanes: the
# loops step i over bit offsets 0, 8, 16 and 24, isolate one lane with
# "(src >> i) & 0xff" and OR the lane's result back in at the same offset.

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# (a lane stays 0 unless src0's lane is larger than src1's)
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 680
 681 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 682
# The *_split pack opcodes take the two halves as separate sources: src0
# supplies the low half of the result and src1 the high half.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# For in-range arguments the result is a mask of "bits" ones starting at bit
# "offset"; out-of-range arguments fold to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# ldexp: src0 scaled by 2^src1 via the C ldexp()/ldexpf(); any result that is
# not a normal number is replaced by a zero carrying src0's sign.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# (src1 selects which byte; the u/i variants cast to uint8_t/int8_t)
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# (src1 selects which 16-bit word; the u/i variants cast to uint16_t/int16_t)
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 724
 725
 726 def triop(name, ty, const_expr):
 727    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 728 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 729    opcode(name, output_size, tuint,
 730    [src1_size, src2_size, src3_size],
 731    [tuint, tuint, tuint], "", const_expr)
 732
# Fused multiply-add, folded as a plain multiply followed by an add.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with src2 as the weight.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")

# 3 way min/max/med
triop("fmin3", tfloat, "fminf(src0, fminf(src1, src2))")
triop("imin3", tint, "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "MAX2(src0, MAX2(src1, src2))")

# Median of three, computed as max(min(max(a, b), c), min(a, b)).
triop("fmed3", tfloat, "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")

# Integer-bool version of the conditional select: a bool32 condition choosing
# between two uint sources.
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool32, tuint, tuint], "", "src0 ? src1 : src2")
 761
# SM5 bfi assembly
#
# "insert" is shifted left until it lines up with the lowest set bit of
# "mask", then merged into "base" under that mask.  A zero mask leaves base
# untouched.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
#
# When the field fits below bit 32 it is moved to the top of the word and then
# shifted back down by (32 - bits); with an unsigned base that zero-extends
# (ubfe), with a signed base the right shift is applied to a signed value
# (ibfe).  Otherwise the result is simply base >> offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
#
# The unsigned version shifts the field down to bit 0 and masks it to "bits"
# bits; the mask is built in 64 bits (1ull) so that bits == 32 still works.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
 820 opcode("ibitfield_extract", 0, tint32,
 821        [0, 0, 0], [tint32, tint32, tint32], "", """
 822 int base = src0;
 823 int offset = src1, bits = src2;
 824 if (bits == 0) {
 825    dst = 0;
 826 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 827    dst = 0;
 828 } else {
 829    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 830 }
 831 """)
 832
# Combines the first component of each input to make a 3-component vector.
# (declared through triop_horiz, so all sources and the result are uint)

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
 840
 841 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 842                  src4_size, const_expr):
 843    opcode(name, output_size, tuint,
 844           [src1_size, src2_size, src3_size, src4_size],
 845           [tuint, tuint, tuint, tuint],
 846           "", const_expr)
 847
# Replaces the "bits"-wide field of "base" starting at bit "offset" with the
# low bits of "insert".  bits == 0 returns base unchanged and out-of-range
# arguments fold to 0.  The mask is built in 64 bits (1ull) so that
# bits == 32 still works.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
 868
 869