src/compiler/nir/nir_opcodes.py

   1 #! /usr/bin/env python
   2 #
   3 # Copyright (C) 2014 Connor Abbott
   4 #
   5 # Permission is hereby granted, free of charge, to any person obtaining a
   6 # copy of this software and associated documentation files (the "Software"),
   7 # to deal in the Software without restriction, including without limitation
   8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9 # and/or sell copies of the Software, and to permit persons to whom the
  10 # Software is furnished to do so, subject to the following conditions:
  11 #
  12 # The above copyright notice and this permission notice (including the next
  13 # paragraph) shall be included in all copies or substantial portions of the
  14 # Software.
  15 #
  16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22 # IN THE SOFTWARE.
  23 #
  24 # Authors:
  25 #    Connor Abbott (cwabbott0@gmail.com)
  26
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  43         prepended before each entry
  44       - const_expr is an expression or series of statements that computes the
  45         constant value of the opcode given the constant values of its inputs.
  46
  47       Constant expressions are formed from the variables src0, src1, ...,
  48       src(N-1), where N is the number of arguments.  The output of the
  49       expression should be stored in the dst variable.  Per-component input
  50       and output variables will be scalars and non-per-component input and
  51       output variables will be a struct with fields named x, y, z, and w
  52       all of the correct type.  Input and output variables can be assumed
  53       to already be of the correct type and need no conversion.  In
  54       particular, the conversion from the C bool type to/from  NIR_TRUE and
  55       NIR_FALSE happens automatically.
  56
  57       For per-component instructions, the entire expression will be
  58       executed once for each component.  For non-per-component
  59       instructions, the expression is expected to store the correct values
  60       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  61       constant expression, an assignment to dst will happen automatically
  62       and the result will be equivalent to "dst = <expression>" for
  63       per-component instructions and "dst.x = dst.y = ... = <expression>"
  64       for non-per-component instructions.
  65       """
  66       assert isinstance(name, str)
  67       assert isinstance(output_size, int)
  68       assert isinstance(output_type, str)
  69       assert isinstance(input_sizes, list)
  70       assert isinstance(input_sizes[0], int)
  71       assert isinstance(input_types, list)
  72       assert isinstance(input_types[0], str)
  73       assert isinstance(algebraic_properties, str)
  74       assert isinstance(const_expr, str)
  75       assert len(input_sizes) == len(input_types)
  76       assert 0 <= output_size <= 4
  77       for size in input_sizes:
  78          assert 0 <= size <= 4
  79          if output_size != 0:
  80             assert size != 0
  81       self.name = name
  82       self.num_inputs = len(input_sizes)
  83       self.output_size = output_size
  84       self.output_type = output_type
  85       self.input_sizes = input_sizes
  86       self.input_types = input_types
  87       self.algebraic_properties = algebraic_properties
  88       self.const_expr = const_expr
  89
  90 # helper variables for strings
  91 tfloat = "float"
  92 tint = "int"
  93 tbool = "bool32"
  94 tuint = "uint"
  95 tfloat32 = "float32"
  96 tint32 = "int32"
  97 tuint32 = "uint32"
  98 tuint64 = "uint64"
  99 tfloat64 = "float64"
 100
 101 commutative = "commutative "
 102 associative = "associative "
 103
 104 # global dictionary of opcodes
 105 opcodes = {}
 106
 107 def opcode(name, output_size, output_type, input_sizes, input_types,
 108            algebraic_properties, const_expr):
 109    assert name not in opcodes
 110    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 111                           input_types, algebraic_properties, const_expr)
 112
 113 def unop_convert(name, out_type, in_type, const_expr):
 114    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
 115
 116 def unop(name, ty, const_expr):
 117    opcode(name, 0, ty, [0], [ty], "", const_expr)
 118
 119 def unop_horiz(name, output_size, output_type, input_size, input_type,
 120                const_expr):
 121    opcode(name, output_size, output_type, [input_size], [input_type], "",
 122           const_expr)
 123
 124 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 125                 reduce_expr, final_expr):
 126    def prereduce(src):
 127       return "(" + prereduce_expr.format(src=src) + ")"
 128    def final(src):
 129       return final_expr.format(src="(" + src + ")")
 130    def reduce_(src0, src1):
 131       return reduce_expr.format(src0=src0, src1=src1)
 132    src0 = prereduce("src0.x")
 133    src1 = prereduce("src0.y")
 134    src2 = prereduce("src0.z")
 135    src3 = prereduce("src0.w")
 136    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 137               final(reduce_(src0, src1)))
 138    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 139               final(reduce_(reduce_(src0, src1), src2)))
 140    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 141               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 142
 143
 144 # These two move instructions differ in what modifiers they support and what
 145 # the negate modifier means. Otherwise, they are identical.
 146 unop("fmov", tfloat, "src0")
 147 unop("imov", tint, "src0")
 148
 149 unop("ineg", tint, "-src0")
 150 unop("fneg", tfloat, "-src0")
 151 unop("inot", tint, "~src0") # invert every bit of the integer
 152 unop("fnot", tfloat, "(src0 == 0.0f) ? 1.0f : 0.0f")
 153 unop("fsign", tfloat, "(src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f)")
 154 unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
 155 unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
 156 unop("fabs", tfloat, "fabsf(src0)")
 157 unop("fsat", tfloat, "(src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0)")
 158 unop("frcp", tfloat, "1.0f / src0")
 159 unop("frsq", tfloat, "1.0f / sqrtf(src0)")
 160 unop("fsqrt", tfloat, "sqrtf(src0)")
 161 unop("fexp2", tfloat, "exp2f(src0)")
 162 unop("flog2", tfloat, "log2f(src0)")
 163 unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
 164 unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
 165 unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
 166 unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
 167 unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
 168 unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
 169 # Float-to-boolean conversion
 170 unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
 171 unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
 172 # Boolean-to-float conversion
 173 unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
 174 # Int-to-boolean conversion
 175 unop_convert("i2b", tbool, tint32, "src0 != 0")
 176 unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
 177 unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
 178 unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
 179 # double-to-float conversion
 180 unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
 181 unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision
 182
 183 # Unary floating-point rounding operations.
 184
 185
 186 unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
 187 unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
 188 unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
 189 unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
 190 unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
 191
 192 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 193
 194 # Trigonometric operations.
 195
 196
 197 unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
 198 unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
 199
 200
 201 # Partial derivatives.
 202
 203
 204 unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
 205 unop("fddy", tfloat, "0.0")
 206 unop("fddx_fine", tfloat, "0.0")
 207 unop("fddy_fine", tfloat, "0.0")
 208 unop("fddx_coarse", tfloat, "0.0")
 209 unop("fddy_coarse", tfloat, "0.0")
 210
 211
 212 # Floating point pack and unpack operations.
 213
 214 def pack_2x16(fmt):
 215    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 216 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 217 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 218 """.replace("fmt", fmt))
 219
 220 def pack_4x8(fmt):
 221    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 222 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 223 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 224 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 225 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 226 """.replace("fmt", fmt))
 227
 228 def unpack_2x16(fmt):
 229    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 230 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 231 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 232 """.replace("fmt", fmt))
 233
 234 def unpack_4x8(fmt):
 235    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 236 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 237 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 238 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 239 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 240 """.replace("fmt", fmt))
 241
 242
 243 pack_2x16("snorm")
 244 pack_4x8("snorm")
 245 pack_2x16("unorm")
 246 pack_4x8("unorm")
 247 pack_2x16("half")
 248 unpack_2x16("snorm")
 249 unpack_4x8("snorm")
 250 unpack_2x16("unorm")
 251 unpack_4x8("unorm")
 252 unpack_2x16("half")
 253
 254 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 255 dst.x = (src0.x & 0xffff) | (src0.y >> 16);
 256 """)
 257
 258 unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
 259 dst.x = (src0.x <<  0) |
 260         (src0.y <<  8) |
 261         (src0.z << 16) |
 262         (src0.w << 24);
 263 """)
 264
 265 unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
 266            "dst.x = src0.x | ((uint64_t)src0.y << 32);")
 267
 268 unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
 269            "dst.x = src0.x; dst.y = src0.x >> 32;")
 270
 271 # Lowered floating point unpacking operations.
 272
 273
 274 unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
 275            "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
 276 unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
 277            "unpack_half_1x16((uint16_t)(src0.x >> 16))")
 278
 279 unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
 280 unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 281
 282 # Bit operations, part of ARB_gpu_shader5.
 283
 284
 285 unop("bitfield_reverse", tuint32, """
 286 /* we're not winning any awards for speed here, but that's ok */
 287 dst = 0;
 288 for (unsigned bit = 0; bit < 32; bit++)
 289    dst |= ((src0 >> bit) & 1) << (31 - bit);
 290 """)
 291 unop("bit_count", tuint32, """
 292 dst = 0;
 293 for (unsigned bit = 0; bit < 32; bit++) {
 294    if ((src0 >> bit) & 1)
 295       dst++;
 296 }
 297 """)
 298
 299 unop_convert("ufind_msb", tint32, tuint32, """
 300 dst = -1;
 301 for (int bit = 31; bit > 0; bit--) {
 302    if ((src0 >> bit) & 1) {
 303       dst = bit;
 304       break;
 305    }
 306 }
 307 """)
 308
 309 unop("ifind_msb", tint32, """
 310 dst = -1;
 311 for (int bit = 31; bit >= 0; bit--) {
 312    /* If src0 < 0, we're looking for the first 0 bit.
 313     * if src0 >= 0, we're looking for the first 1 bit.
 314     */
 315    if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
 316       (!((src0 >> bit) & 1) && (src0 < 0))) {
 317       dst = bit;
 318       break;
 319    }
 320 }
 321 """)
 322
 323 unop("find_lsb", tint32, """
 324 dst = -1;
 325 for (unsigned bit = 0; bit < 32; bit++) {
 326    if ((src0 >> bit) & 1) {
 327       dst = bit;
 328       break;
 329    }
 330 }
 331 """)
 332
 333
 334 for i in xrange(1, 5):
 335    for j in xrange(1, 5):
 336       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 337
 338 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 339    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 340
 341 def binop(name, ty, alg_props, const_expr):
 342    binop_convert(name, ty, ty, alg_props, const_expr)
 343
 344 def binop_compare(name, ty, alg_props, const_expr):
 345    binop_convert(name, tbool, ty, alg_props, const_expr)
 346
 347 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 348                 src2_type, const_expr):
 349    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 350           "", const_expr)
 351
 352 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 353                  reduce_expr, final_expr):
 354    def final(src):
 355       return final_expr.format(src= "(" + src + ")")
 356    def reduce_(src0, src1):
 357       return reduce_expr.format(src0=src0, src1=src1)
 358    def prereduce(src0, src1):
 359       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 360    src0 = prereduce("src0.x", "src1.x")
 361    src1 = prereduce("src0.y", "src1.y")
 362    src2 = prereduce("src0.z", "src1.z")
 363    src3 = prereduce("src0.w", "src1.w")
 364    opcode(name + "2", output_size, output_type,
 365           [2, 2], [src_type, src_type], commutative,
 366           final(reduce_(src0, src1)))
 367    opcode(name + "3", output_size, output_type,
 368           [3, 3], [src_type, src_type], commutative,
 369           final(reduce_(reduce_(src0, src1), src2)))
 370    opcode(name + "4", output_size, output_type,
 371           [4, 4], [src_type, src_type], commutative,
 372           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 373
 374 binop("fadd", tfloat, commutative + associative, "src0 + src1")
 375 binop("iadd", tint, commutative + associative, "src0 + src1")
 376 binop("fsub", tfloat, "", "src0 - src1")
 377 binop("isub", tint, "", "src0 - src1")
 378
 379 binop("fmul", tfloat, commutative + associative, "src0 * src1")
 380 # low 32-bits of signed/unsigned integer multiply
 381 binop("imul", tint, commutative + associative, "src0 * src1")
 382 # high 32-bits of signed integer multiply
 383 binop("imul_high", tint32, commutative,
 384       "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
 385 # high 32-bits of unsigned integer multiply
 386 binop("umul_high", tuint32, commutative,
 387       "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")
 388
 389 binop("fdiv", tfloat, "", "src0 / src1")
 390 binop("idiv", tint, "", "src0 / src1")
 391 binop("udiv", tuint, "", "src0 / src1")
 392
 393 # returns a boolean representing the carry resulting from the addition of
 394 # the two unsigned arguments.
 395
 396 binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")
 397
 398 # returns a boolean representing the borrow resulting from the subtraction
 399 # of the two unsigned arguments.
 400
 401 binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 402
 403 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
 404
 405 # For signed integers, there are several different possible definitions of
 406 # "modulus" or "remainder".  We follow the conventions used by LLVM and
 407 # SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
 408 # operation while the imod opcode implements the more mathematical
 409 # "modulus" operation.  For details on the difference, see
 410 #
 411 # http://mathforum.org/library/drmath/view/52343.html
 412
 413 binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
 414 binop("imod", tint, "",
 415       "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
 416       "                 src0 % src1 : src0 % src1 + src1)")
 417 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 418 binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 419
 420 #
 421 # Comparisons
 422 #
 423
 424
 425 # these integer-aware comparisons return a boolean (0 or ~0)
 426
 427 binop_compare("flt", tfloat, "", "src0 < src1")
 428 binop_compare("fge", tfloat, "", "src0 >= src1")
 429 binop_compare("feq", tfloat, commutative, "src0 == src1")
 430 binop_compare("fne", tfloat, commutative, "src0 != src1")
 431 binop_compare("ilt", tint, "", "src0 < src1")
 432 binop_compare("ige", tint, "", "src0 >= src1")
 433 binop_compare("ieq", tint, commutative, "src0 == src1")
 434 binop_compare("ine", tint, commutative, "src0 != src1")
 435 binop_compare("ult", tuint, "", "src0 < src1")
 436 binop_compare("uge", tuint, "", "src0 >= src1")
 437
 438 # integer-aware GLSL-style comparisons that compare floats and ints
 439
 440 binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
 441              "{src0} && {src1}", "{src}")
 442 binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
 443              "{src0} || {src1}", "{src}")
 444 binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
 445              "{src0} && {src1}", "{src}")
 446 binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
 447              "{src0} || {src1}", "{src}")
 448
 449 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 450
 451 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
 452              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
 453 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
 454              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 455
 456 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 457 # and false respectively
 458
 459 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
 460 binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 461 binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 462 binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 463
 464
 465 binop("ishl", tint, "", "src0 << src1")
 466 binop("ishr", tint, "", "src0 >> src1")
 467 binop("ushr", tuint, "", "src0 >> src1")
 468
 469 # bitwise logic operators
 470 #
 471 # These are also used as boolean and, or, xor for hardware supporting
 472 # integers.
 473
 474
 475 binop("iand", tuint, commutative + associative, "src0 & src1")
 476 binop("ior", tuint, commutative + associative, "src0 | src1")
 477 binop("ixor", tuint, commutative + associative, "src0 ^ src1")
 478
 479
 480 # floating point logic operators
 481 #
 482 # These use (src != 0.0) for testing the truth of the input, and output 1.0
 483 # for true and 0.0 for false
 484
 485 binop("fand", tfloat32, commutative,
 486       "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
 487 binop("for", tfloat32, commutative,
 488       "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
 489 binop("fxor", tfloat32, commutative,
 490       "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
 491
 492 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 493              "{src}")
 494
 495 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 496              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 497
 498 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
 499        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 500 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
 501        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 502
 503 binop("fmin", tfloat, "", "fminf(src0, src1)")
 504 binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
 505 binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
 506 binop("fmax", tfloat, "", "fmaxf(src0, src1)")
 507 binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
 508 binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 509
 510 # Saturated vector add for 4 8bit ints.
 511 binop("usadd_4x8", tint32, commutative + associative, """
 512 dst = 0;
 513 for (int i = 0; i < 32; i += 8) {
 514    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 515 }
 516 """)
 517
 518 # Saturated vector subtract for 4 8bit ints.
 519 binop("ussub_4x8", tint32, "", """
 520 dst = 0;
 521 for (int i = 0; i < 32; i += 8) {
 522    int src0_chan = (src0 >> i) & 0xff;
 523    int src1_chan = (src1 >> i) & 0xff;
 524    if (src0_chan > src1_chan)
 525       dst |= (src0_chan - src1_chan) << i;
 526 }
 527 """)
 528
 529 # vector min for 4 8bit ints.
 530 binop("umin_4x8", tint32, commutative + associative, """
 531 dst = 0;
 532 for (int i = 0; i < 32; i += 8) {
 533    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 534 }
 535 """)
 536
 537 # vector max for 4 8bit ints.
 538 binop("umax_4x8", tint32, commutative + associative, """
 539 dst = 0;
 540 for (int i = 0; i < 32; i += 8) {
 541    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 542 }
 543 """)
 544
 545 # unorm multiply: (a * b) / 255.
 546 binop("umul_unorm_4x8", tint32, commutative + associative, """
 547 dst = 0;
 548 for (int i = 0; i < 32; i += 8) {
 549    int src0_chan = (src0 >> i) & 0xff;
 550    int src1_chan = (src1 >> i) & 0xff;
 551    dst |= ((src0_chan * src1_chan) / 255) << i;
 552 }
 553 """)
 554
 555 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 556
 557 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 558             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 559
 560 binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
 561               "src0 | ((uint64_t)src1 << 32)")
 562
 563 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 564 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
 565 # if either of its arguments are 32.
 566 binop_convert("bfm", tuint32, tint32, "", """
 567 int bits = src0, offset = src1;
 568 if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
 569    dst = 0; /* undefined */
 570 else
 571    dst = ((1u << bits) - 1) << offset;
 572 """)
 573
 574 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
 575 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 576 /* flush denormals to zero. */
 577 if (!isnormal(dst))
 578    dst = copysignf(0.0f, src0);
 579 """)
 580
 581 # Combines the first component of each input to make a 2-component vector.
 582
 583 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 584 dst.x = src0.x;
 585 dst.y = src1.x;
 586 """)
 587
 588 # Byte extraction
 589 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 590 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 591
 592 # Word extraction
 593 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 594 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 595
 596
 597 def triop(name, ty, const_expr):
 598    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 599 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 600    opcode(name, output_size, tuint,
 601    [src1_size, src2_size, src3_size],
 602    [tuint, tuint, tuint], "", const_expr)
 603
 604 triop("ffma", tfloat, "src0 * src1 + src2")
 605
 606 triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
 607
 608 # Conditional Select
 609 #
 610 # A vector conditional select instruction (like ?:, but operating per-
 611 # component on vectors). There are two versions, one for floating point
 612 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 613
 614
 615 triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
 616 opcode("bcsel", 0, tuint, [0, 0, 0],
 617       [tbool, tuint, tuint], "", "src0 ? src1 : src2")
 618
 619 # SM5 bfi assembly
 620 triop("bfi", tuint32, """
 621 unsigned mask = src0, insert = src1, base = src2;
 622 if (mask == 0) {
 623    dst = base;
 624 } else {
 625    unsigned tmp = mask;
 626    while (!(tmp & 1)) {
 627       tmp >>= 1;
 628       insert <<= 1;
 629    }
 630    dst = (base & ~mask) | (insert & mask);
 631 }
 632 """)
 633
 634 # SM5 ubfe/ibfe assembly
 635 opcode("ubfe", 0, tuint32,
 636        [0, 0, 0], [tuint32, tint32, tint32], "", """
 637 unsigned base = src0;
 638 int offset = src1, bits = src2;
 639 if (bits == 0) {
 640    dst = 0;
 641 } else if (bits < 0 || offset < 0) {
 642    dst = 0; /* undefined */
 643 } else if (offset + bits < 32) {
 644    dst = (base << (32 - bits - offset)) >> (32 - bits);
 645 } else {
 646    dst = base >> offset;
 647 }
 648 """)
 649 opcode("ibfe", 0, tint32,
 650        [0, 0, 0], [tint32, tint32, tint32], "", """
 651 int base = src0;
 652 int offset = src1, bits = src2;
 653 if (bits == 0) {
 654    dst = 0;
 655 } else if (bits < 0 || offset < 0) {
 656    dst = 0; /* undefined */
 657 } else if (offset + bits < 32) {
 658    dst = (base << (32 - bits - offset)) >> (32 - bits);
 659 } else {
 660    dst = base >> offset;
 661 }
 662 """)
 663
 664 # GLSL bitfieldExtract()
 665 opcode("ubitfield_extract", 0, tuint32,
 666        [0, 0, 0], [tuint32, tint32, tint32], "", """
 667 unsigned base = src0;
 668 int offset = src1, bits = src2;
 669 if (bits == 0) {
 670    dst = 0;
 671 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
 672    dst = 0; /* undefined per the spec */
 673 } else {
 674    dst = (base >> offset) & ((1ull << bits) - 1);
 675 }
 676 """)
 677 opcode("ibitfield_extract", 0, tint32,
 678        [0, 0, 0], [tint32, tint32, tint32], "", """
 679 int base = src0;
 680 int offset = src1, bits = src2;
 681 if (bits == 0) {
 682    dst = 0;
 683 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 684    dst = 0;
 685 } else {
 686    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 687 }
 688 """)
 689
 690 # Combines the first component of each input to make a 3-component vector.
 691
 692 triop_horiz("vec3", 3, 1, 1, 1, """
 693 dst.x = src0.x;
 694 dst.y = src1.x;
 695 dst.z = src2.x;
 696 """)
 697
 698 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 699                  src4_size, const_expr):
 700    opcode(name, output_size, tuint,
 701           [src1_size, src2_size, src3_size, src4_size],
 702           [tuint, tuint, tuint, tuint],
 703           "", const_expr)
 704
 705 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
 706        [tuint32, tuint32, tint32, tint32], "", """
 707 unsigned base = src0, insert = src1;
 708 int offset = src2, bits = src3;
 709 if (bits == 0) {
 710    dst = 0;
 711 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
 712    dst = 0;
 713 } else {
 714    unsigned mask = ((1ull << bits) - 1) << offset;
 715    dst = (base & ~mask) | ((insert << bits) & mask);
 716 }
 717 """)
 718
 719 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
 720 dst.x = src0.x;
 721 dst.y = src1.x;
 722 dst.z = src2.x;
 723 dst.w = src3.x;
 724 """)
 725
 726