src/compiler/nir/nir_opcodes.py

   1 #! /usr/bin/env python
   2 #
   3 # Copyright (C) 2014 Connor Abbott
   4 #
   5 # Permission is hereby granted, free of charge, to any person obtaining a
   6 # copy of this software and associated documentation files (the "Software"),
   7 # to deal in the Software without restriction, including without limitation
   8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9 # and/or sell copies of the Software, and to permit persons to whom the
  10 # Software is furnished to do so, subject to the following conditions:
  11 #
  12 # The above copyright notice and this permission notice (including the next
  13 # paragraph) shall be included in all copies or substantial portions of the
  14 # Software.
  15 #
  16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22 # IN THE SOFTWARE.
  23 #
  24 # Authors:
  25 #    Connor Abbott (cwabbott0@gmail.com)
  26
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  43         prepended before each entry
  44       - const_expr is an expression or series of statements that computes the
  45         constant value of the opcode given the constant values of its inputs.
  46
  47       Constant expressions are formed from the variables src0, src1, ...,
  48       src(N-1), where N is the number of arguments.  The output of the
  49       expression should be stored in the dst variable.  Per-component input
  50       and output variables will be scalars and non-per-component input and
  51       output variables will be a struct with fields named x, y, z, and w
  52       all of the correct type.  Input and output variables can be assumed
  53       to already be of the correct type and need no conversion.  In
  54       particular, the conversion from the C bool type to/from  NIR_TRUE and
  55       NIR_FALSE happens automatically.
  56
  57       For per-component instructions, the entire expression will be
  58       executed once for each component.  For non-per-component
  59       instructions, the expression is expected to store the correct values
  60       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  61       constant expression, an assignment to dst will happen automatically
  62       and the result will be equivalent to "dst = <expression>" for
  63       per-component instructions and "dst.x = dst.y = ... = <expression>"
  64       for non-per-component instructions.
  65       """
  66       assert isinstance(name, str)
  67       assert isinstance(output_size, int)
  68       assert isinstance(output_type, str)
  69       assert isinstance(input_sizes, list)
  70       assert isinstance(input_sizes[0], int)
  71       assert isinstance(input_types, list)
  72       assert isinstance(input_types[0], str)
  73       assert isinstance(algebraic_properties, str)
  74       assert isinstance(const_expr, str)
  75       assert len(input_sizes) == len(input_types)
  76       assert 0 <= output_size <= 4
  77       for size in input_sizes:
  78          assert 0 <= size <= 4
  79          if output_size != 0:
  80             assert size != 0
  81       self.name = name
  82       self.num_inputs = len(input_sizes)
  83       self.output_size = output_size
  84       self.output_type = output_type
  85       self.input_sizes = input_sizes
  86       self.input_types = input_types
  87       self.algebraic_properties = algebraic_properties
  88       self.const_expr = const_expr
  89
  90 # helper variables for strings
# Type-name strings used for the output_type / input_types arguments of the
# opcode helpers below.  The first four carry no bit-size suffix (except
# tbool, which is "bool32"); the rest name an explicit 32- or 64-bit type.
tfloat = "float"
tint = "int"
tbool = "bool32"
tuint = "uint"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Algebraic property names.  Each ends in a space so that several can be
# joined with "+" into the space-separated algebraic_properties string.
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
# Maps opcode name -> Opcode instance; filled in by opcode().
opcodes = {}
 107
 108 def opcode(name, output_size, output_type, input_sizes, input_types,
 109            algebraic_properties, const_expr):
 110    assert name not in opcodes
 111    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 112                           input_types, algebraic_properties, const_expr)
 113
 114 def unop_convert(name, out_type, in_type, const_expr):
 115    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
 116
 117 def unop(name, ty, const_expr):
 118    opcode(name, 0, ty, [0], [ty], "", const_expr)
 119
 120 def unop_horiz(name, output_size, output_type, input_size, input_type,
 121                const_expr):
 122    opcode(name, output_size, output_type, [input_size], [input_type], "",
 123           const_expr)
 124
 125 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 126                 reduce_expr, final_expr):
 127    def prereduce(src):
 128       return "(" + prereduce_expr.format(src=src) + ")"
 129    def final(src):
 130       return final_expr.format(src="(" + src + ")")
 131    def reduce_(src0, src1):
 132       return reduce_expr.format(src0=src0, src1=src1)
 133    src0 = prereduce("src0.x")
 134    src1 = prereduce("src0.y")
 135    src2 = prereduce("src0.z")
 136    src3 = prereduce("src0.w")
 137    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 138               final(reduce_(src0, src1)))
 139    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 140               final(reduce_(reduce_(src0, src1), src2)))
 141    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 142               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 143
 144
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
# (For constant folding both simply yield src0.)
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Arithmetic negation for integers and floats.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
 153 unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
 154                       "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# Sign: 0 for zero, 1 for positive, -1 otherwise.  The float version selects
# double or float literals depending on bit_size.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
# Absolute value.
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)")
# Saturate: values above 1 become 1, values <= 0 become 0, anything else
# passes through unchanged.
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root, using the double or
# float C functions depending on bit_size.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): exp2/log2 always use the single-precision C functions, even
# though the type is the unsized tfloat -- confirm this is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
# Type conversions.  Most constant expressions are just "src0": per the
# Opcode docstring, sources and destination already have the declared types,
# so the assignment to dst performs the conversion.
unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
unop_convert("i2i32", tint32, tint, "src0")    # General int (int8_t, int64_t, etc.) to int32_t conversion
unop_convert("u2i32", tint32, tuint, "src0")   # General uint (uint8_t, uint64_t, etc.) to int32_t conversion
unop_convert("i2u32", tuint32, tint, "src0")   # General int (int8_t, int64_t, etc.) to uint32_t conversion
unop_convert("u2u32", tuint32, tuint, "src0")  # General uint (uint8_t, uint64_t, etc.) to uint32_t conversion
unop_convert("i2i64", tint64, tint, "src0")    # General int (int8_t, int32_t, etc.) to int64_t conversion
unop_convert("u2i64", tint64, tuint, "src0")   # General uint (uint8_t, uint64_t, etc.) to int64_t conversion
unop_convert("f2i64", tint64, tfloat, "src0")  # General float (float or double) to int64_t conversion
unop_convert("i2u64", tuint64, tint,  "src0")  # General int (int8_t, int64_t, etc.) to uint64_t conversion
unop_convert("u2u64", tuint64, tuint, "src0")  # General uint (uint8_t, uint32_t, etc.) to uint64_t conversion
unop_convert("f2u64", tuint64, tfloat, "src0") # General float (float or double) to uint64_t conversion
unop_convert("i642f", tfloat32, tint64, "src0")  # int64_t-to-float conversion.
unop_convert("i642b", tbool, tint64, "src0")  # int64_t-to-bool conversion.
unop_convert("i642d", tfloat64, tint64, "src0")  # int64_t-to-double conversion.
unop_convert("u642f", tfloat32, tuint64, "src0") # uint64_t-to-float conversion.
unop_convert("u642d", tfloat64, tuint64, "src0") # uint64_t-to-double conversion.

# Float-to-boolean conversion
# (true when the source compares unequal to zero)
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
# Boolean-to-float conversion
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tbool, tint, "src0 != 0")
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("b2i64", tint64, tbool, "src0 ? 1 : 0")  # Boolean-to-int64_t conversion.
unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
# Conversions between single and double precision
unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision
 205
# Unary floating-point rounding operations.
# Each picks the double or float C function depending on bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# Fractional part: src0 minus its floor.
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Magnitudes below 2^-14 become a zero with the sign of src0; everything
# else is round-tripped through the float->half->float helpers.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.
# All variants constant-fold to 0.0.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 233
 234
 235 # Floating point pack and unpack operations.
 236
 237 def pack_2x16(fmt):
 238    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 239 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 240 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 241 """.replace("fmt", fmt))
 242
 243 def pack_4x8(fmt):
 244    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 245 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 246 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 247 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 248 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 249 """.replace("fmt", fmt))
 250
 251 def unpack_2x16(fmt):
 252    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 253 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 254 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 255 """.replace("fmt", fmt))
 256
 257 def unpack_4x8(fmt):
 258    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 259 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 260 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 261 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 262 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 263 """.replace("fmt", fmt))
 264
 265
# Instantiate the pack/unpack opcodes for each format.  "half" only has the
# 2x16 variants.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
 276
# Packs the low 16 bits of x with y shifted into the upper half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# ORs the four components together at byte offsets 0, 8, 16 and 24.  The
# components are not masked to 8 bits first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Combines two uint32 values into one uint64: x is the low word, y the high
# word.
unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")
 290
 291 unop_horiz("pack_int_2x32", 1, tint64, 2, tint32,
 292            "dst.x = src0.x | ((int64_t)src0.y << 32);")
 293
# Split a 64-bit value into two 32-bit components: x receives the source
# assigned directly (narrowed to the 32-bit destination), y the source
# shifted right by 32.
unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_int_2x32", 2, tint32, 1, tint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
 299
# Lowered floating point unpacking operations.
# Each opcode yields a single half of the packed source.


unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

# The _x variants assign the 64-bit source straight to the 32-bit
# destination; the _y variants shift it right by 32 first.
unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")
unop_convert("unpack_int_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_int_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 312
# Bit operations, part of ARB_gpu_shader5.


# Moves bit number "bit" of the source to bit number 31 - bit of the result.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the 32 bits of the source.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")
 329
 330 unop_convert("ufind_msb", tint32, tuint32, """
 331 dst = -1;
 332 for (int bit = 31; bit > 0; bit--) {
 333    if ((src0 >> bit) & 1) {
 334       dst = bit;
 335       break;
 336    }
 337 }
 338 """)
 339
# Signed find-MSB: scanning from bit 31 down to 0, yields the index of the
# first 1 bit for non-negative sources or of the first 0 bit for negative
# sources; -1 when no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 363
 364
 365 for i in xrange(1, 5):
 366    for j in xrange(1, 5):
 367       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 368
 369 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 370    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 371
 372 def binop(name, ty, alg_props, const_expr):
 373    binop_convert(name, ty, ty, alg_props, const_expr)
 374
 375 def binop_compare(name, ty, alg_props, const_expr):
 376    binop_convert(name, tbool, ty, alg_props, const_expr)
 377
 378 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 379                 src2_type, const_expr):
 380    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 381           "", const_expr)
 382
 383 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 384                  reduce_expr, final_expr):
 385    def final(src):
 386       return final_expr.format(src= "(" + src + ")")
 387    def reduce_(src0, src1):
 388       return reduce_expr.format(src0=src0, src1=src1)
 389    def prereduce(src0, src1):
 390       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 391    src0 = prereduce("src0.x", "src1.x")
 392    src1 = prereduce("src0.y", "src1.y")
 393    src2 = prereduce("src0.z", "src1.z")
 394    src3 = prereduce("src0.w", "src1.w")
 395    opcode(name + "2", output_size, output_type,
 396           [2, 2], [src_type, src_type], commutative,
 397           final(reduce_(src0, src1)))
 398    opcode(name + "3", output_size, output_type,
 399           [3, 3], [src_type, src_type], commutative,
 400           final(reduce_(reduce_(src0, src1), src2)))
 401    opcode(name + "4", output_size, output_type,
 402           [4, 4], [src_type, src_type], commutative,
 403           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 404
# Addition and subtraction.  Only addition is marked commutative and
# associative.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
# (the product is computed in 64 bits and shifted right by 32)
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")
 419
 420 binop("fdiv", tfloat, "", "src0 / src1")
 421 binop("idiv", tint, "", "src0 / src1")
 422 binop("udiv", tuint, "", "src0 / src1")
 423
# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
# (the opcode's result type is uint; the carry is detected as the sum being
# smaller than src0)

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
# (the opcode's result type is uint)

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# Unsigned remainder; folds to 0 when the divisor is 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when the remainder is non-zero and the
# operands differ in sign.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# Float modulus (floor-based) and remainder (truncation-based).
# NOTE(review): both always call the single-precision floorf/truncf even
# though the type is the unsized tfloat -- confirm for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 450
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
# (registered via binop_compare, so the result type is tbool)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
# ("all equal" ANDs the per-component comparisons, "any not equal" ORs
# them; binop_reduce creates the 2-, 3- and 4-component variants)

binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal


# Shifts.  The shift count (src1) is always a uint32, whatever the type of
# the value being shifted.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")
 499
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: per-component multiply followed by a sum, in 2-, 3- and
# 4-component variants.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Same expression as fdot, but declared with a 4-component output.
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Dot product of a 3-component src0 with the xyz of a 4-component src1,
# plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.  The float versions go through fminf/fmaxf and carry
# no algebraic properties; the integer versions are marked commutative and
# associative.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 540
# The opcodes below treat each 32-bit source as four 8-bit channels.  The
# loop variable i is the bit offset of the channel (0, 8, 16, 24).

# Saturated vector add for 4 8bit ints.
# (each channel sum is clamped to 0xff)
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# (a channel stays 0 unless src0's channel is greater than src1's)
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 585
 586 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 587
# Packs two float32 scalars as halves into one uint32: src0 ends up in the
# low 16 bits, src1 in the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Combine two uint32 values into one uint64: src0 is the low word, src1 the
# high word.
binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_int_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")
 596
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# src0 is the number of bits and src1 the offset; the result is a mask of
# "bits" ones starting at bit "offset".  Out-of-range arguments fold to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")
 607
# ldexp: a float source and an int32 exponent, using ldexp or ldexpf
# depending on bit_size.  Any result for which isnormal() is false is then
# replaced by a zero carrying the sign of src0.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
 614
# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# (src1 selects the byte, counting from the least significant one; the _i
# variant goes through a signed 8-bit cast, the _u variant an unsigned one)
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# (same scheme with 16-bit words)
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 629
 630
 631 def triop(name, ty, const_expr):
 632    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 633 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 634    opcode(name, output_size, tuint,
 635    [src1_size, src2_size, src3_size],
 636    [tuint, tuint, tuint], "", const_expr)
 637
# Multiply-add: src0 * src1 + src2.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
# bcsel takes a tbool condition and selects between two tuint values.
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool, tuint, tuint], "", "src0 ? src1 : src2")
 652
# SM5 bfi assembly
#
# src0 is the mask, src1 the value to insert and src2 the base.  A zero mask
# yields the base unchanged.  Otherwise "insert" is shifted left once for
# every trailing zero bit of the mask and then merged into the base under
# the mask.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")
 667
# SM5 ubfe/ibfe assembly
#
# src0 is the base value, src1 the offset and src2 the number of bits.
# bits == 0 and negative arguments fold to 0.  When offset + bits < 32 the
# field is shifted to the top of the word and back down, so the right shift
# follows the signedness of "base" (unsigned for ubfe, int for ibfe).
# Otherwise the base is simply shifted right by offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
 697
# GLSL bitfieldExtract()
#
# Unsigned variant: src0 is the base, src1 the offset and src2 the number of
# bits.  bits == 0 and out-of-range arguments fold to 0; otherwise the base
# is shifted right by offset and masked to "bits" bits.  The mask is built
# with a 64-bit shift (1ull), which also covers bits == 32.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
 711 opcode("ibitfield_extract", 0, tint32,
 712        [0, 0, 0], [tint32, tint32, tint32], "", """
 713 int base = src0;
 714 int offset = src1, bits = src2;
 715 if (bits == 0) {
 716    dst = 0;
 717 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 718    dst = 0;
 719 } else {
 720    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 721 }
 722 """)
 723
# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
 731
 732 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 733                  src4_size, const_expr):
 734    opcode(name, output_size, tuint,
 735           [src1_size, src2_size, src3_size, src4_size],
 736           [tuint, tuint, tuint, tuint],
 737           "", const_expr)
 738
 739 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
 740        [tuint32, tuint32, tint32, tint32], "", """
 741 unsigned base = src0, insert = src1;
 742 int offset = src2, bits = src3;
 743 if (bits == 0) {
 744    dst = 0;
 745 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
 746    dst = 0;
 747 } else {
 748    unsigned mask = ((1ull << bits) - 1) << offset;
 749    dst = (base & ~mask) | ((insert << bits) & mask);
 750 }
 751 """)
 752
# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
 759
 760