src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  43         prepended before each entry
  44       - const_expr is an expression or series of statements that computes the
  45         constant value of the opcode given the constant values of its inputs.
  46
  47       Constant expressions are formed from the variables src0, src1, ...,
  48       src(N-1), where N is the number of arguments.  The output of the
  49       expression should be stored in the dst variable.  Per-component input
  50       and output variables will be scalars and non-per-component input and
  51       output variables will be a struct with fields named x, y, z, and w
  52       all of the correct type.  Input and output variables can be assumed
  53       to already be of the correct type and need no conversion.  In
  54       particular, the conversion from the C bool type to/from  NIR_TRUE and
  55       NIR_FALSE happens automatically.
  56
  57       For per-component instructions, the entire expression will be
  58       executed once for each component.  For non-per-component
  59       instructions, the expression is expected to store the correct values
  60       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  61       constant expression, an assignment to dst will happen automatically
  62       and the result will be equivalent to "dst = <expression>" for
  63       per-component instructions and "dst.x = dst.y = ... = <expression>"
  64       for non-per-component instructions.
  65       """
  66       assert isinstance(name, str)
  67       assert isinstance(output_size, int)
  68       assert isinstance(output_type, str)
  69       assert isinstance(input_sizes, list)
  70       assert isinstance(input_sizes[0], int)
  71       assert isinstance(input_types, list)
  72       assert isinstance(input_types[0], str)
  73       assert isinstance(algebraic_properties, str)
  74       assert isinstance(const_expr, str)
  75       assert len(input_sizes) == len(input_types)
  76       assert 0 <= output_size <= 4
  77       for size in input_sizes:
  78          assert 0 <= size <= 4
  79          if output_size != 0:
  80             assert size != 0
  81       self.name = name
  82       self.num_inputs = len(input_sizes)
  83       self.output_size = output_size
  84       self.output_type = output_type
  85       self.input_sizes = input_sizes
  86       self.input_types = input_types
  87       self.algebraic_properties = algebraic_properties
  88       self.const_expr = const_expr
  89
  90 # helper variables for strings
  91 tfloat = "float"
  92 tint = "int"
  93 tbool = "bool"
  94 tbool32 = "bool32"
  95 tuint = "uint"
  96 tuint16 = "uint16"
  97 tfloat32 = "float32"
  98 tint32 = "int32"
  99 tuint32 = "uint32"
 100 tint64 = "int64"
 101 tuint64 = "uint64"
 102 tfloat64 = "float64"
 103
 104 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 105
 106 def type_has_size(type_):
 107     m = _TYPE_SPLIT_RE.match(type_)
 108     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 109     return m.group('bits') is not None
 110
 111 def type_size(type_):
 112     m = _TYPE_SPLIT_RE.match(type_)
 113     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 114     assert m.group('bits') is not None, \
 115            'NIR type string has no bit size: "{}"'.format(type_)
 116     return int(m.group('bits'))
 117
 118 def type_sizes(type_):
 119     if type_has_size(type_):
 120         return [type_size(type_)]
 121     elif type_ == 'bool':
 122         return [32]
 123     elif type_ == 'float':
 124         return [16, 32, 64]
 125     else:
 126         return [8, 16, 32, 64]
 127
 128 def type_base_type(type_):
 129     m = _TYPE_SPLIT_RE.match(type_)
 130     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 131     return m.group('type')
 132
 133 commutative = "commutative "
 134 associative = "associative "
 135
 136 # global dictionary of opcodes
 137 opcodes = {}
 138
 139 def opcode(name, output_size, output_type, input_sizes, input_types,
 140            algebraic_properties, const_expr):
 141    assert name not in opcodes
 142    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 143                           input_types, algebraic_properties, const_expr)
 144
 145 def unop_convert(name, out_type, in_type, const_expr):
 146    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
 147
 148 def unop(name, ty, const_expr):
 149    opcode(name, 0, ty, [0], [ty], "", const_expr)
 150
 151 def unop_horiz(name, output_size, output_type, input_size, input_type,
 152                const_expr):
 153    opcode(name, output_size, output_type, [input_size], [input_type], "",
 154           const_expr)
 155
 156 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 157                 reduce_expr, final_expr):
 158    def prereduce(src):
 159       return "(" + prereduce_expr.format(src=src) + ")"
 160    def final(src):
 161       return final_expr.format(src="(" + src + ")")
 162    def reduce_(src0, src1):
 163       return reduce_expr.format(src0=src0, src1=src1)
 164    src0 = prereduce("src0.x")
 165    src1 = prereduce("src0.y")
 166    src2 = prereduce("src0.z")
 167    src3 = prereduce("src0.w")
 168    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 169               final(reduce_(src0, src1)))
 170    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 171               final(reduce_(reduce_(src0, src1), src2)))
 172    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 173               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 174
 175
 176 # These two move instructions differ in what modifiers they support and what
 177 # the negate modifier means. Otherwise, they are identical.
 178 unop("fmov", tfloat, "src0")
 179 unop("imov", tint, "src0")
 180
 181 unop("ineg", tint, "-src0")
 182 unop("fneg", tfloat, "-src0")
 183 unop("inot", tint, "~src0") # invert every bit of the integer
 184 unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
 185                       "((src0 == 0.0f) ? 1.0f : 0.0f)"))
 186 unop("fsign", tfloat, ("bit_size == 64 ? " +
 187                        "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
 188                        "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
 189 unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
 190 unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
 191 unop("fabs", tfloat, "fabs(src0)")
 192 unop("fsat", tfloat, ("bit_size == 64 ? " +
 193                       "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
 194                       "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
 195 unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
 196 unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
 197 unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
 198 unop("fexp2", tfloat, "exp2f(src0)")
 199 unop("flog2", tfloat, "log2f(src0)")
 200
 201 # Generate all of the numeric conversion opcodes
 202 for src_t in [tint, tuint, tfloat, tbool]:
 203    if src_t == tbool:
 204       dst_types = [tfloat, tint]
 205    elif src_t == tint:
 206       dst_types = [tfloat, tint, tbool]
 207    elif src_t == tuint:
 208       dst_types = [tfloat, tuint]
 209    elif src_t == tfloat:
 210       dst_types = [tint, tuint, tfloat, tbool]
 211
 212    for dst_t in dst_types:
 213       for bit_size in type_sizes(dst_t):
 214           if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
 215               rnd_modes = ['_rtne', '_rtz', '']
 216               for rnd_mode in rnd_modes:
 217                   unop_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
 218                                                        bit_size, rnd_mode),
 219                                dst_t + str(bit_size), src_t, "src0")
 220           else:
 221               conv_expr = "src0 != 0" if dst_t == tbool else "src0"
 222               unop_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
 223                            dst_t + str(bit_size), src_t, conv_expr)
 224
 225
 226 # Unary floating-point rounding operations.
 227
 228
 229 unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
 230 unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
 231 unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
 232 unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
 233 unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
 234
 235 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 236
 237 # Trigonometric operations.
 238
 239
 240 unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
 241 unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
 242
 243 # dfrexp
 244 unop_convert("frexp_exp", tint32, tfloat64, "frexp(src0, &dst);")
 245 unop_convert("frexp_sig", tfloat64, tfloat64, "int n; dst = frexp(src0, &n);")
 246
 247 # Partial derivatives.
 248
 249
 250 unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
 251 unop("fddy", tfloat, "0.0")
 252 unop("fddx_fine", tfloat, "0.0")
 253 unop("fddy_fine", tfloat, "0.0")
 254 unop("fddx_coarse", tfloat, "0.0")
 255 unop("fddy_coarse", tfloat, "0.0")
 256
 257
 258 # Floating point pack and unpack operations.
 259
 260 def pack_2x16(fmt):
 261    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 262 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 263 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 264 """.replace("fmt", fmt))
 265
 266 def pack_4x8(fmt):
 267    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 268 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 269 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 270 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 271 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 272 """.replace("fmt", fmt))
 273
 274 def unpack_2x16(fmt):
 275    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 276 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 277 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 278 """.replace("fmt", fmt))
 279
 280 def unpack_4x8(fmt):
 281    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 282 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 283 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 284 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 285 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 286 """.replace("fmt", fmt))
 287
 288
 289 pack_2x16("snorm")
 290 pack_4x8("snorm")
 291 pack_2x16("unorm")
 292 pack_4x8("unorm")
 293 pack_2x16("half")
 294 unpack_2x16("snorm")
 295 unpack_4x8("snorm")
 296 unpack_2x16("unorm")
 297 unpack_4x8("unorm")
 298 unpack_2x16("half")
 299
 300 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 301 dst.x = (src0.x & 0xffff) | (src0.y << 16);
 302 """)
 303
 304 unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
 305 dst.x = (src0.x <<  0) |
 306         (src0.y <<  8) |
 307         (src0.z << 16) |
 308         (src0.w << 24);
 309 """)
 310
 311 unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
 312            "dst.x = src0.x | ((uint32_t)src0.y << 16);")
 313
 314 unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
 315            "dst.x = src0.x | ((uint64_t)src0.y << 32);")
 316
 317 unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
 318            "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
 319
 320 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
 321            "dst.x = src0.x; dst.y = src0.x >> 32;")
 322
 323 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 324            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 325
 326 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
 327            "dst.x = src0.x; dst.y = src0.x >> 16;")
 328
 329 # Lowered floating point unpacking operations.
 330
 331
 332 unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
 333              "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
 334 unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
 335              "unpack_half_1x16((uint16_t)(src0 >> 16))")
 336
 337 unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
 338 unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")
 339
 340 unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
 341 unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 342
# Bit operations, part of ARB_gpu_shader5.
#
# Every constant expression in this group is a plain C loop over the
# individual bits of src0.


# Mirror the 32 bits of src0: source bit N ends up in bit (31 - N).
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Count the set bits among the low bit_size bits of src0.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the highest set bit of an unsigned source, or -1 if no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant: scanning down from bit 31, find the first 1 bit of a
# non-negative source or the first 0 bit of a negative one; -1 if there is
# no such bit.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the lowest set bit of the source, or -1 if no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 393
 394
 395 for i in range(1, 5):
 396    for j in range(1, 5):
 397       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 398
 399
# AMD_gcn_shader extended instructions
#
# Both opcodes take a 3-component float direction and test, per axis and
# sign, whether that axis has the largest absolute value.  The tests use >=
# and every matching "if" assigns dst, so when several axes tie the last
# matching statement determines the result.

# cube_face_coord: writes a 2-component coordinate built from the two
# components that are not on the selected axis.  dst starts out as (0, 0).
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
""")

# cube_face_index: writes the face number as a float:
# 0 = +X, 1 = -X, 2 = +Y, 3 = -Y, 4 = +Z, 5 = -Z.
# NOTE(review): unlike cube_face_coord, this expression has no initial
# assignment to dst.x before the tests -- confirm that is acceptable when
# no test matches.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
 425
 426
 427 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 428    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 429
 430 def binop(name, ty, alg_props, const_expr):
 431    binop_convert(name, ty, ty, alg_props, const_expr)
 432
 433 def binop_compare(name, ty, alg_props, const_expr):
 434    binop_convert(name, tbool32, ty, alg_props, const_expr)
 435
 436 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 437                 src2_type, const_expr):
 438    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 439           "", const_expr)
 440
 441 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 442                  reduce_expr, final_expr):
 443    def final(src):
 444       return final_expr.format(src= "(" + src + ")")
 445    def reduce_(src0, src1):
 446       return reduce_expr.format(src0=src0, src1=src1)
 447    def prereduce(src0, src1):
 448       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 449    src0 = prereduce("src0.x", "src1.x")
 450    src1 = prereduce("src0.y", "src1.y")
 451    src2 = prereduce("src0.z", "src1.z")
 452    src3 = prereduce("src0.w", "src1.w")
 453    opcode(name + "2", output_size, output_type,
 454           [2, 2], [src_type, src_type], commutative,
 455           final(reduce_(src0, src1)))
 456    opcode(name + "3", output_size, output_type,
 457           [3, 3], [src_type, src_type], commutative,
 458           final(reduce_(reduce_(src0, src1), src2)))
 459    opcode(name + "4", output_size, output_type,
 460           [4, 4], [src_type, src_type], commutative,
 461           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 462
 463 binop("fadd", tfloat, commutative + associative, "src0 + src1")
 464 binop("iadd", tint, commutative + associative, "src0 + src1")
 465 binop("fsub", tfloat, "", "src0 - src1")
 466 binop("isub", tint, "", "src0 - src1")
 467
 468 binop("fmul", tfloat, commutative + associative, "src0 * src1")
 469 # low 32-bits of signed/unsigned integer multiply
 470 binop("imul", tint, commutative + associative, "src0 * src1")
 471
# High half of the signed integer multiply.
#
# For 64-bit operands each source is sign-extended into four 32-bit limbs,
# the limbs are multiplied with ubm_mul_u32arr, and the result is assembled
# from limbs 2 and 3 of the product.  For every other bit size the product
# is computed in 64 bits and shifted right by bit_size.
binop("imul_high", tint, commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# High half of the unsigned integer multiply.
#
# Same structure as imul_high, except that the 64-bit path splits each
# source into just two 32-bit limbs (no sign extension is needed).
binop("umul_high", tuint, commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")
 512
 513 binop("fdiv", tfloat, "", "src0 / src1")
 514 binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
 515 binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
 516
 517 # returns a boolean representing the carry resulting from the addition of
 518 # the two unsigned arguments.
 519
 520 binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")
 521
 522 # returns a boolean representing the borrow resulting from the subtraction
 523 # of the two unsigned arguments.
 524
 525 binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 526
 527 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
 528
 529 # For signed integers, there are several different possible definitions of
 530 # "modulus" or "remainder".  We follow the conventions used by LLVM and
 531 # SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
 532 # operation while the imod opcode implements the more mathematical
 533 # "modulus" operation.  For details on the difference, see
 534 #
 535 # http://mathforum.org/library/drmath/view/52343.html
 536
 537 binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
 538 binop("imod", tint, "",
 539       "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
 540       "                 src0 % src1 : src0 % src1 + src1)")
 541 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 542 binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 543
 544 #
 545 # Comparisons
 546 #
 547
 548
 549 # these integer-aware comparisons return a boolean (0 or ~0)
 550
 551 binop_compare("flt", tfloat, "", "src0 < src1")
 552 binop_compare("fge", tfloat, "", "src0 >= src1")
 553 binop_compare("feq", tfloat, commutative, "src0 == src1")
 554 binop_compare("fne", tfloat, commutative, "src0 != src1")
 555 binop_compare("ilt", tint, "", "src0 < src1")
 556 binop_compare("ige", tint, "", "src0 >= src1")
 557 binop_compare("ieq", tint, commutative, "src0 == src1")
 558 binop_compare("ine", tint, commutative, "src0 != src1")
 559 binop_compare("ult", tuint, "", "src0 < src1")
 560 binop_compare("uge", tuint, "", "src0 >= src1")
 561
 562 # integer-aware GLSL-style comparisons that compare floats and ints
 563
 564 binop_reduce("ball_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
 565              "{src0} && {src1}", "{src}")
 566 binop_reduce("bany_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
 567              "{src0} || {src1}", "{src}")
 568 binop_reduce("ball_iequal",  1, tbool32, tint, "{src0} == {src1}",
 569              "{src0} && {src1}", "{src}")
 570 binop_reduce("bany_inequal", 1, tbool32, tint, "{src0} != {src1}",
 571              "{src0} || {src1}", "{src}")
 572
 573 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 574
 575 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
 576              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
 577 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
 578              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 579
 580 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 581 # and false respectively
 582
 583 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
 584 binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 585 binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 586 binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 587
 588
 589 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
 590 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
 591 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")
 592
 593 # bitwise logic operators
 594 #
 595 # These are also used as boolean and, or, xor for hardware supporting
 596 # integers.
 597
 598
 599 binop("iand", tuint, commutative + associative, "src0 & src1")
 600 binop("ior", tuint, commutative + associative, "src0 | src1")
 601 binop("ixor", tuint, commutative + associative, "src0 ^ src1")
 602
 603
 604 # floating point logic operators
 605 #
 606 # These use (src != 0.0) for testing the truth of the input, and output 1.0
 607 # for true and 0.0 for false
 608
 609 binop("fand", tfloat32, commutative,
 610       "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
 611 binop("for", tfloat32, commutative,
 612       "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
 613 binop("fxor", tfloat32, commutative,
 614       "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
 615
 616 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 617              "{src}")
 618
 619 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 620              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 621
 622 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
 623        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 624 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
 625        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 626
 627 binop("fmin", tfloat, "", "fminf(src0, src1)")
 628 binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
 629 binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
 630 binop("fmax", tfloat, "", "fmaxf(src0, src1)")
 631 binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
 632 binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 633
# Operations on four 8-bit channels packed into one 32-bit value.  Every
# expression walks the channels with a shift of i = 0, 8, 16, 24, extracts
# each channel with "& 0xff" and ORs the per-channel result back into dst
# at the same position.

# Saturated vector add for 4 8bit ints: each channel sum is clamped to 0xff.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints: a channel is left at 0 unless
# the src0 channel is greater than the src1 channel.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255, per channel.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 678
 679 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 680
 681 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 682             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 683
 684 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 685               "src0 | ((uint64_t)src1 << 32)")
 686
 687 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
 688               "src0 | ((uint32_t)src1 << 16)")
 689
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# src0 is the number of bits and src1 the offset.  The constant expression
# yields a mask of "bits" ones shifted left by "offset", or 0 when either
# value is outside 0..31 or when offset + bits exceeds 32.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# ldexp takes a float value (src0) and a 32-bit integer exponent (src1).
# Any result for which isnormal() is false is replaced by a zero that
# carries the sign of src0.
# NOTE(review): isnormal() is also false for infinities and NaN, so those
# results are replaced by zero as well, not only denormals -- confirm this
# is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
 707
 708 # Combines the first component of each input to make a 2-component vector.
 709
 710 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 711 dst.x = src0.x;
 712 dst.y = src1.x;
 713 """)
 714
 715 # Byte extraction
 716 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 717 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 718
 719 # Word extraction
 720 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 721 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 722
 723
 724 def triop(name, ty, const_expr):
 725    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 726 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 727    opcode(name, output_size, tuint,
 728    [src1_size, src2_size, src3_size],
 729    [tuint, tuint, tuint], "", const_expr)
 730
 731 triop("ffma", tfloat, "src0 * src1 + src2")
 732
 733 triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
 734
 735 # Conditional Select
 736 #
 737 # A vector conditional select instruction (like ?:, but operating per-
 738 # component on vectors). There are two versions, one for floating point
 739 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 740
 741
 742 triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
 743
 744 # 3 way min/max/med
 745 triop("fmin3", tfloat, "fminf(src0, fminf(src1, src2))")
 746 triop("imin3", tint, "MIN2(src0, MIN2(src1, src2))")
 747 triop("umin3", tuint, "MIN2(src0, MIN2(src1, src2))")
 748
 749 triop("fmax3", tfloat, "fmaxf(src0, fmaxf(src1, src2))")
 750 triop("imax3", tint, "MAX2(src0, MAX2(src1, src2))")
 751 triop("umax3", tuint, "MAX2(src0, MAX2(src1, src2))")
 752
 753 triop("fmed3", tfloat, "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 754 triop("imed3", tint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 755 triop("umed3", tuint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 756
 757 opcode("bcsel", 0, tuint, [0, 0, 0],
 758       [tbool32, tuint, tuint], "", "src0 ? src1 : src2")
 759
# SM5 bfi assembly
#
# src0 is the mask, src1 the value to insert and src2 the base.  The insert
# value is shifted left once for every trailing zero bit of the mask, then
# merged into the base under the mask.  A zero mask returns the base
# unchanged (and keeps the shift loop from running forever).
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
#
# src0 is the base value, src1 the offset and src2 the number of bits.
# Both yield 0 for bits == 0 and for negative bits or offset.  When
# offset + bits is below 32 the field is moved to the top of the word and
# shifted back down by (32 - bits); otherwise the base is simply shifted
# right by offset.  ubfe works on an unsigned base, ibfe on a signed int
# base.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
 804
# GLSL bitfieldExtract()
#
# Unsigned variant: src0 is the base value, src1 the offset and src2 the
# number of bits.  The result is 0 for bits == 0 and for out-of-range
# arguments (negative values or offset + bits > 32); otherwise the base is
# shifted right by offset and masked down to "bits" bits.  The mask is
# built from a 64-bit constant (1ull) so that bits == 32 is representable.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
 818 opcode("ibitfield_extract", 0, tint32,
 819        [0, 0, 0], [tint32, tint32, tint32], "", """
 820 int base = src0;
 821 int offset = src1, bits = src2;
 822 if (bits == 0) {
 823    dst = 0;
 824 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 825    dst = 0;
 826 } else {
 827    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 828 }
 829 """)
 830
 831 # Combines the first component of each input to make a 3-component vector.
 832
 833 triop_horiz("vec3", 3, 1, 1, 1, """
 834 dst.x = src0.x;
 835 dst.y = src1.x;
 836 dst.z = src2.x;
 837 """)
 838
 839 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 840                  src4_size, const_expr):
 841    opcode(name, output_size, tuint,
 842           [src1_size, src2_size, src3_size, src4_size],
 843           [tuint, tuint, tuint, tuint],
 844           "", const_expr)
 845
# bitfield_insert: src0 is the base value, src1 the value to insert, src2
# the offset and src3 the number of bits.  bits == 0 returns the base
# unchanged and out-of-range arguments (negative values or
# bits + offset > 32) yield 0.  Otherwise "bits" bits of the insert value,
# shifted left by offset, replace the corresponding bits of the base.  The
# mask is built from a 64-bit constant (1ull) so that bits == 32 is
# representable.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
 859
 860 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
 861 dst.x = src0.x;
 862 dst.y = src1.x;
 863 dst.z = src2.x;
 864 dst.w = src3.x;
 865 """)
 866
 867