src/compiler/nir/nir_opcodes.py

   1 #! /usr/bin/env python
   2 #
   3 # Copyright (C) 2014 Connor Abbott
   4 #
   5 # Permission is hereby granted, free of charge, to any person obtaining a
   6 # copy of this software and associated documentation files (the "Software"),
   7 # to deal in the Software without restriction, including without limitation
   8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9 # and/or sell copies of the Software, and to permit persons to whom the
  10 # Software is furnished to do so, subject to the following conditions:
  11 #
  12 # The above copyright notice and this permission notice (including the next
  13 # paragraph) shall be included in all copies or substantial portions of the
  14 # Software.
  15 #
  16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22 # IN THE SOFTWARE.
  23 #
  24 # Authors:
  25 #    Connor Abbott (cwabbott0@gmail.com)
  26
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  43         prepended before each entry
  44       - const_expr is an expression or series of statements that computes the
  45         constant value of the opcode given the constant values of its inputs.
  46
  47       Constant expressions are formed from the variables src0, src1, ...,
  48       src(N-1), where N is the number of arguments.  The output of the
  49       expression should be stored in the dst variable.  Per-component input
  50       and output variables will be scalars and non-per-component input and
  51       output variables will be a struct with fields named x, y, z, and w
  52       all of the correct type.  Input and output variables can be assumed
  53       to already be of the correct type and need no conversion.  In
  54       particular, the conversion from the C bool type to/from  NIR_TRUE and
  55       NIR_FALSE happens automatically.
  56
  57       For per-component instructions, the entire expression will be
  58       executed once for each component.  For non-per-component
  59       instructions, the expression is expected to store the correct values
  60       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  61       constant expression, an assignment to dst will happen automatically
  62       and the result will be equivalent to "dst = <expression>" for
  63       per-component instructions and "dst.x = dst.y = ... = <expression>"
  64       for non-per-component instructions.
  65       """
  66       assert isinstance(name, str)
  67       assert isinstance(output_size, int)
  68       assert isinstance(output_type, str)
  69       assert isinstance(input_sizes, list)
  70       assert isinstance(input_sizes[0], int)
  71       assert isinstance(input_types, list)
  72       assert isinstance(input_types[0], str)
  73       assert isinstance(algebraic_properties, str)
  74       assert isinstance(const_expr, str)
  75       assert len(input_sizes) == len(input_types)
  76       assert 0 <= output_size <= 4
  77       for size in input_sizes:
  78          assert 0 <= size <= 4
  79          if output_size != 0:
  80             assert size != 0
  81       self.name = name
  82       self.num_inputs = len(input_sizes)
  83       self.output_size = output_size
  84       self.output_type = output_type
  85       self.input_sizes = input_sizes
  86       self.input_types = input_types
  87       self.algebraic_properties = algebraic_properties
  88       self.const_expr = const_expr
  89
# helper variables for strings
#
# Type names handed to opcode() as output/input types.  Per the Opcode
# docstring they get nir_type_ prepended.  Some names carry an explicit
# 32 or 64 suffix and the others carry none.
tfloat = "float"
tint = "int"
tbool = "bool32"
tuint = "uint"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tuint64 = "uint64"
tfloat64 = "float64"

# Algebraic property names.  Each ends in a space so that several can be
# joined with "+" into the space-separated string that Opcode expects.
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
# Maps opcode name -> Opcode instance; filled in by opcode() below.
opcodes = {}
 106
 107 def opcode(name, output_size, output_type, input_sizes, input_types,
 108            algebraic_properties, const_expr):
 109    assert name not in opcodes
 110    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 111                           input_types, algebraic_properties, const_expr)
 112
 113 def unop_convert(name, out_type, in_type, const_expr):
 114    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
 115
 116 def unop(name, ty, const_expr):
 117    opcode(name, 0, ty, [0], [ty], "", const_expr)
 118
 119 def unop_horiz(name, output_size, output_type, input_size, input_type,
 120                const_expr):
 121    opcode(name, output_size, output_type, [input_size], [input_type], "",
 122           const_expr)
 123
 124 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 125                 reduce_expr, final_expr):
 126    def prereduce(src):
 127       return "(" + prereduce_expr.format(src=src) + ")"
 128    def final(src):
 129       return final_expr.format(src="(" + src + ")")
 130    def reduce_(src0, src1):
 131       return reduce_expr.format(src0=src0, src1=src1)
 132    src0 = prereduce("src0.x")
 133    src1 = prereduce("src0.y")
 134    src2 = prereduce("src0.z")
 135    src3 = prereduce("src0.w")
 136    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 137               final(reduce_(src0, src1)))
 138    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 139               final(reduce_(reduce_(src0, src1), src2)))
 140    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 141               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 142
 143
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Basic per-component arithmetic.  Several float expressions test bit_size
# to choose between the double and the single precision C form.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 when the source is 0.0 and 0.0 otherwise.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# fsign/isign yield 0 for zero, 1 for positive and -1 for negative sources.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)")
# fsat clamps to the range [0.0, 1.0].
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): unlike the ops above, these two always call the single
# precision C functions although the type is the unsized tfloat -- confirm
# whether 64-bit sources are meant to be supported here.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
# Conversions between explicitly sized types.  Most const_exprs are just
# "src0"; per the Opcode docstring the variables already have the right type.
unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
# Float-to-boolean conversion
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
# Boolean-to-float conversion
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tbool, tint32, "src0 != 0")
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
# double-to-float conversion
unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision
 187
# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# ffract is the source minus its floor.
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Sources with magnitude below 2^-14 become a zero carrying the source's
# sign; everything else is converted to half precision and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.


# All six variants constant-fold to the same value.
unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 215
 216
 217 # Floating point pack and unpack operations.
 218
 219 def pack_2x16(fmt):
 220    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 221 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 222 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 223 """.replace("fmt", fmt))
 224
 225 def pack_4x8(fmt):
 226    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 227 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 228 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 229 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 230 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 231 """.replace("fmt", fmt))
 232
 233 def unpack_2x16(fmt):
 234    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 235 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 236 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 237 """.replace("fmt", fmt))
 238
 239 def unpack_4x8(fmt):
 240    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 241 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 242 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 243 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 244 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 245 """.replace("fmt", fmt))
 246
 247
# Instantiate the pack/unpack helpers.  snorm and unorm get both the 2x16
# and the 4x8 form; half only has the 2x16 form.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Low 16 bits of src0.x in the low half, src0.y shifted into the high half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# One byte position per component; the components are not masked first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# src0.x forms the low 32 bits and src0.y the high 32 bits of the result.
unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

# Inverse of the above: dst.x takes the low word, dst.y the high word.
unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
 275
# Lowered floating point unpacking operations.


# _split_x converts the low 16 bits of the source, _split_y the high 16.
unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

# _split_x yields the low 32 bits of the 64-bit source, _split_y the high 32.
unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


# Moves bit N of the source to bit 31 - N of the result.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the 32 bits of the source.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")
 303
 304 unop_convert("ufind_msb", tint32, tuint32, """
 305 dst = -1;
 306 for (int bit = 31; bit > 0; bit--) {
 307    if ((src0 >> bit) & 1) {
 308       dst = bit;
 309       break;
 310    }
 311 }
 312 """)
 313
# Signed find-MSB: scanning down from bit 31, reports the first bit that
# differs from the sign (first 1 for non-negative sources, first 0 for
# negative ones), or -1 if there is none.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 337
 338
 339 for i in xrange(1, 5):
 340    for j in xrange(1, 5):
 341       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 342
 343 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 344    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 345
 346 def binop(name, ty, alg_props, const_expr):
 347    binop_convert(name, ty, ty, alg_props, const_expr)
 348
 349 def binop_compare(name, ty, alg_props, const_expr):
 350    binop_convert(name, tbool, ty, alg_props, const_expr)
 351
 352 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 353                 src2_type, const_expr):
 354    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 355           "", const_expr)
 356
 357 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 358                  reduce_expr, final_expr):
 359    def final(src):
 360       return final_expr.format(src= "(" + src + ")")
 361    def reduce_(src0, src1):
 362       return reduce_expr.format(src0=src0, src1=src1)
 363    def prereduce(src0, src1):
 364       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 365    src0 = prereduce("src0.x", "src1.x")
 366    src1 = prereduce("src0.y", "src1.y")
 367    src2 = prereduce("src0.z", "src1.z")
 368    src3 = prereduce("src0.w", "src1.w")
 369    opcode(name + "2", output_size, output_type,
 370           [2, 2], [src_type, src_type], commutative,
 371           final(reduce_(src0, src1)))
 372    opcode(name + "3", output_size, output_type,
 373           [3, 3], [src_type, src_type], commutative,
 374           final(reduce_(reduce_(src0, src1), src2)))
 375    opcode(name + "4", output_size, output_type,
 376           [4, 4], [src_type, src_type], commutative,
 377           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 378
# Addition and subtraction.  The additions are flagged commutative and
# associative; the subtractions carry no algebraic properties.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# Division.  None of these const_exprs special-case a zero divisor.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
binop("udiv", tuint, "", "src0 / src1")
 397
# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

# The sum being smaller than src0 is how the wrap-around is detected.
binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# A zero divisor yields 0 instead of evaluating the C "%" operator.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# NOTE(review): both use the single precision floorf/truncf although the
# type is the unsized tfloat -- confirm behaviour for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 424
 425 #
 426 # Comparisons
 427 #
 428
 429
# these integer-aware comparisons return a boolean (0 or ~0)

# Only "less than" and "greater or equal" are defined for the ordered
# comparisons.  The equality tests are flagged commutative.
binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

# Each call registers the 2-, 3- and 4-component variants.  "all equal"
# combines the per-component results with &&, "any not equal" with ||.
binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 468
 469
# Shifts.  ishr shifts a signed value and ushr an unsigned one; the shift
# count is used as given, without masking.
binop("ishl", tint, "", "src0 << src1")
binop("ishr", tint, "", "src0 >> src1")
binop("ushr", tuint, "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
# True when exactly one of the two sources is non-zero.
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
 496
# Dot product: multiply matching components and sum the products.  The
# reduce helper registers the 2-, 3- and 4-component variants.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Same expression as fdot but registered with an output size of 4.
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Three-component dot product with src1.w added to the sum; src0 has 3
# components and src1 has 4.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# NOTE(review): fmin/fmax call the single precision fminf/fmaxf although the
# type is the unsized tfloat, and unlike the integer variants they carry no
# algebraic properties -- confirm both are intended.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 514
# Saturated vector add for 4 8bit ints.
# Each loop iteration handles one byte lane (i = 0, 8, 16, 24); the lane sum
# is capped at 0xff.  MIN2/MAX2 are presumably macros available to the
# generated C -- they are not defined in this file.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A lane is only written when src0's byte exceeds src1's, so any lane that
# would go negative stays 0.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
# Applied independently to each of the four byte lanes.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 559
 560 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 561
# Packs two floats as half floats: src0.x in the low 16 bits, src1.x in the
# high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# src0 forms the low 32 bits and src1 the high 32 bits of the 64-bit result.
binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
# In range, the result is a mask of "bits" ones starting at bit "offset";
# out-of-range arguments fold to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# src0 is the float value and src1 the int32 exponent.  Any result that
# isnormal() rejects is replaced by a zero carrying src0's sign.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# src1 selects which byte of src0; the cast to int8_t/uint8_t decides
# whether the extracted value is sign- or zero-extended.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# Same scheme with 16-bit words.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 600
 601
 602 def triop(name, ty, const_expr):
 603    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 604 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 605    opcode(name, output_size, tuint,
 606    [src1_size, src2_size, src3_size],
 607    [tuint, tuint, tuint], "", const_expr)
 608
# Multiply-add: src0 * src1 + src2.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with src2 as the weight.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
# bcsel cannot use triop() because its condition is a tbool while the two
# selected values are tuint.
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool, tuint, tuint], "", "src0 ? src1 : src2")
 623
# SM5 bfi assembly
# src0 is the mask, src1 the value to insert and src2 the base.  The insert
# value is shifted left until it lines up with the lowest set mask bit, then
# merged into base under the mask.  A zero mask returns base unchanged.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
# src0 is the base value, src1 the bit offset and src2 the field width.
# When the field fits below bit 32 it is moved to the top of the word and
# shifted back down; otherwise the result is simply base >> offset.  The
# two opcodes differ only in signedness: base is unsigned for ubfe and int
# for ibfe.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
# Unsigned variant: shift the field down to bit 0 and mask off "bits" bits.
# The mask is built with 1ull so that bits == 32 does not overflow the shift.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
 682 opcode("ibitfield_extract", 0, tint32,
 683        [0, 0, 0], [tint32, tint32, tint32], "", """
 684 int base = src0;
 685 int offset = src1, bits = src2;
 686 if (bits == 0) {
 687    dst = 0;
 688 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 689    dst = 0;
 690 } else {
 691    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 692 }
 693 """)
 694
# Combines the first component of each input to make a 3-component vector.

# Output size is 3 and each of the three inputs has size 1.
triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
 702
 703 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 704                  src4_size, const_expr):
 705    opcode(name, output_size, tuint,
 706           [src1_size, src2_size, src3_size, src4_size],
 707           [tuint, tuint, tuint, tuint],
 708           "", const_expr)
 709
 710 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
 711        [tuint32, tuint32, tint32, tint32], "", """
 712 unsigned base = src0, insert = src1;
 713 int offset = src2, bits = src3;
 714 if (bits == 0) {
 715    dst = 0;
 716 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
 717    dst = 0;
 718 } else {
 719    unsigned mask = ((1ull << bits) - 1) << offset;
 720    dst = (base & ~mask) | ((insert << bits) & mask);
 721 }
 722 """)
 723
# Combines the first component of each of the four inputs to make a
# 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
 730
 731