src/compiler/nir/nir_opcodes.py

   1 #! /usr/bin/env python
   2 #
   3 # Copyright (C) 2014 Connor Abbott
   4 #
   5 # Permission is hereby granted, free of charge, to any person obtaining a
   6 # copy of this software and associated documentation files (the "Software"),
   7 # to deal in the Software without restriction, including without limitation
   8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9 # and/or sell copies of the Software, and to permit persons to whom the
  10 # Software is furnished to do so, subject to the following conditions:
  11 #
  12 # The above copyright notice and this permission notice (including the next
  13 # paragraph) shall be included in all copies or substantial portions of the
  14 # Software.
  15 #
  16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22 # IN THE SOFTWARE.
  23 #
  24 # Authors:
  25 #    Connor Abbott (cwabbott0@gmail.com)
  26
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  43         prepended before each entry
  44       - const_expr is an expression or series of statements that computes the
  45         constant value of the opcode given the constant values of its inputs.
  46
  47       Constant expressions are formed from the variables src0, src1, ...,
  48       src(N-1), where N is the number of arguments.  The output of the
  49       expression should be stored in the dst variable.  Per-component input
  50       and output variables will be scalars and non-per-component input and
  51       output variables will be a struct with fields named x, y, z, and w
  52       all of the correct type.  Input and output variables can be assumed
  53       to already be of the correct type and need no conversion.  In
  54       particular, the conversion from the C bool type to/from  NIR_TRUE and
  55       NIR_FALSE happens automatically.
  56
  57       For per-component instructions, the entire expression will be
  58       executed once for each component.  For non-per-component
  59       instructions, the expression is expected to store the correct values
  60       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  61       constant expression, an assignment to dst will happen automatically
  62       and the result will be equivalent to "dst = <expression>" for
  63       per-component instructions and "dst.x = dst.y = ... = <expression>"
  64       for non-per-component instructions.
  65       """
  66       assert isinstance(name, str)
  67       assert isinstance(output_size, int)
  68       assert isinstance(output_type, str)
  69       assert isinstance(input_sizes, list)
  70       assert isinstance(input_sizes[0], int)
  71       assert isinstance(input_types, list)
  72       assert isinstance(input_types[0], str)
  73       assert isinstance(algebraic_properties, str)
  74       assert isinstance(const_expr, str)
  75       assert len(input_sizes) == len(input_types)
  76       assert 0 <= output_size <= 4
  77       for size in input_sizes:
  78          assert 0 <= size <= 4
  79          if output_size != 0:
  80             assert size != 0
  81       self.name = name
  82       self.num_inputs = len(input_sizes)
  83       self.output_size = output_size
  84       self.output_type = output_type
  85       self.input_sizes = input_sizes
  86       self.input_types = input_types
  87       self.algebraic_properties = algebraic_properties
  88       self.const_expr = const_expr
  89
  90 # helper variables for strings
  91 tfloat = "float"
  92 tint = "int"
  93 tbool = "bool32"
  94 tuint = "uint"
  95 tfloat32 = "float32"
  96 tint32 = "int32"
  97 tuint32 = "uint32"
  98 tfloat64 = "float64"
  99
 100 commutative = "commutative "
 101 associative = "associative "
 102
 103 # global dictionary of opcodes
 104 opcodes = {}
 105
 106 def opcode(name, output_size, output_type, input_sizes, input_types,
 107            algebraic_properties, const_expr):
 108    assert name not in opcodes
 109    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 110                           input_types, algebraic_properties, const_expr)
 111
 112 def unop_convert(name, out_type, in_type, const_expr):
 113    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
 114
 115 def unop(name, ty, const_expr):
 116    opcode(name, 0, ty, [0], [ty], "", const_expr)
 117
 118 def unop_horiz(name, output_size, output_type, input_size, input_type,
 119                const_expr):
 120    opcode(name, output_size, output_type, [input_size], [input_type], "",
 121           const_expr)
 122
 123 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 124                 reduce_expr, final_expr):
 125    def prereduce(src):
 126       return "(" + prereduce_expr.format(src=src) + ")"
 127    def final(src):
 128       return final_expr.format(src="(" + src + ")")
 129    def reduce_(src0, src1):
 130       return reduce_expr.format(src0=src0, src1=src1)
 131    src0 = prereduce("src0.x")
 132    src1 = prereduce("src0.y")
 133    src2 = prereduce("src0.z")
 134    src3 = prereduce("src0.w")
 135    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 136               final(reduce_(src0, src1)))
 137    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 138               final(reduce_(reduce_(src0, src1), src2)))
 139    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 140               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 141
 142
 143 # These two move instructions differ in what modifiers they support and what
 144 # the negate modifier means. Otherwise, they are identical.
 145 unop("fmov", tfloat, "src0")
 146 unop("imov", tint, "src0")
 147
 148 unop("ineg", tint, "-src0")
 149 unop("fneg", tfloat, "-src0")
 150 unop("inot", tint, "~src0") # invert every bit of the integer
 151 unop("fnot", tfloat, "(src0 == 0.0f) ? 1.0f : 0.0f")
 152 unop("fsign", tfloat, "(src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f)")
 153 unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
 154 unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
 155 unop("fabs", tfloat, "fabsf(src0)")
 156 unop("fsat", tfloat, "(src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0)")
 157 unop("frcp", tfloat, "1.0f / src0")
 158 unop("frsq", tfloat, "1.0f / sqrtf(src0)")
 159 unop("fsqrt", tfloat, "sqrtf(src0)")
 160 unop("fexp2", tfloat, "exp2f(src0)")
 161 unop("flog2", tfloat, "log2f(src0)")
 162 unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
 163 unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
 164 unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
 165 unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
 166 unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
 167 # Float-to-boolean conversion
 168 unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
 169 unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
 170 # Boolean-to-float conversion
 171 unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
 172 # Int-to-boolean conversion
 173 unop_convert("i2b", tbool, tint32, "src0 != 0")
 174 unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
 175 unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
 176 # double-to-float conversion
 177 unop_convert("d2f", tfloat32, tfloat64, "src0") # Single to double precision
 178 unop_convert("f2d", tfloat64, tfloat32, "src0") # Double to single precision
 179
 180 # Unary floating-point rounding operations.
 181
 182
 183 unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
 184 unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
 185 unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
 186 unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
 187 unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
 188
 189 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 190
 191 # Trigonometric operations.
 192
 193
 194 unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
 195 unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
 196
 197
 198 # Partial derivatives.
 199
 200
 201 unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
 202 unop("fddy", tfloat, "0.0")
 203 unop("fddx_fine", tfloat, "0.0")
 204 unop("fddy_fine", tfloat, "0.0")
 205 unop("fddx_coarse", tfloat, "0.0")
 206 unop("fddy_coarse", tfloat, "0.0")
 207
 208
 209 # Floating point pack and unpack operations.
 210
 211 def pack_2x16(fmt):
 212    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 213 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 214 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 215 """.replace("fmt", fmt))
 216
 217 def pack_4x8(fmt):
 218    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 219 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 220 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 221 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 222 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 223 """.replace("fmt", fmt))
 224
 225 def unpack_2x16(fmt):
 226    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 227 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 228 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 229 """.replace("fmt", fmt))
 230
 231 def unpack_4x8(fmt):
 232    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 233 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 234 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 235 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 236 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 237 """.replace("fmt", fmt))
 238
 239
 240 pack_2x16("snorm")
 241 pack_4x8("snorm")
 242 pack_2x16("unorm")
 243 pack_4x8("unorm")
 244 pack_2x16("half")
 245 unpack_2x16("snorm")
 246 unpack_4x8("snorm")
 247 unpack_2x16("unorm")
 248 unpack_4x8("unorm")
 249 unpack_2x16("half")
 250
 251 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 252 dst.x = (src0.x & 0xffff) | (src0.y >> 16);
 253 """)
 254
 255 unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
 256 dst.x = (src0.x <<  0) |
 257         (src0.y <<  8) |
 258         (src0.z << 16) |
 259         (src0.w << 24);
 260 """)
 261
 262 # Lowered floating point unpacking operations.
 263
 264
 265 unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
 266            "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
 267 unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
 268            "unpack_half_1x16((uint16_t)(src0.x >> 16))")
 269
 270
 271 # Bit operations, part of ARB_gpu_shader5.
 272
 273
 274 unop("bitfield_reverse", tuint32, """
 275 /* we're not winning any awards for speed here, but that's ok */
 276 dst = 0;
 277 for (unsigned bit = 0; bit < 32; bit++)
 278    dst |= ((src0 >> bit) & 1) << (31 - bit);
 279 """)
 280 unop("bit_count", tuint32, """
 281 dst = 0;
 282 for (unsigned bit = 0; bit < 32; bit++) {
 283    if ((src0 >> bit) & 1)
 284       dst++;
 285 }
 286 """)
 287
 288 unop_convert("ufind_msb", tint32, tuint32, """
 289 dst = -1;
 290 for (int bit = 31; bit > 0; bit--) {
 291    if ((src0 >> bit) & 1) {
 292       dst = bit;
 293       break;
 294    }
 295 }
 296 """)
 297
 298 unop("ifind_msb", tint32, """
 299 dst = -1;
 300 for (int bit = 31; bit >= 0; bit--) {
 301    /* If src0 < 0, we're looking for the first 0 bit.
 302     * if src0 >= 0, we're looking for the first 1 bit.
 303     */
 304    if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
 305       (!((src0 >> bit) & 1) && (src0 < 0))) {
 306       dst = bit;
 307       break;
 308    }
 309 }
 310 """)
 311
 312 unop("find_lsb", tint32, """
 313 dst = -1;
 314 for (unsigned bit = 0; bit < 32; bit++) {
 315    if ((src0 >> bit) & 1) {
 316       dst = bit;
 317       break;
 318    }
 319 }
 320 """)
 321
 322
 323 for i in xrange(1, 5):
 324    for j in xrange(1, 5):
 325       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 326
 327 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 328    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 329
 330 def binop(name, ty, alg_props, const_expr):
 331    binop_convert(name, ty, ty, alg_props, const_expr)
 332
 333 def binop_compare(name, ty, alg_props, const_expr):
 334    binop_convert(name, tbool, ty, alg_props, const_expr)
 335
 336 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 337                 src2_type, const_expr):
 338    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 339           "", const_expr)
 340
 341 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 342                  reduce_expr, final_expr):
 343    def final(src):
 344       return final_expr.format(src= "(" + src + ")")
 345    def reduce_(src0, src1):
 346       return reduce_expr.format(src0=src0, src1=src1)
 347    def prereduce(src0, src1):
 348       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 349    src0 = prereduce("src0.x", "src1.x")
 350    src1 = prereduce("src0.y", "src1.y")
 351    src2 = prereduce("src0.z", "src1.z")
 352    src3 = prereduce("src0.w", "src1.w")
 353    opcode(name + "2", output_size, output_type,
 354           [2, 2], [src_type, src_type], commutative,
 355           final(reduce_(src0, src1)))
 356    opcode(name + "3", output_size, output_type,
 357           [3, 3], [src_type, src_type], commutative,
 358           final(reduce_(reduce_(src0, src1), src2)))
 359    opcode(name + "4", output_size, output_type,
 360           [4, 4], [src_type, src_type], commutative,
 361           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 362
 363 binop("fadd", tfloat, commutative + associative, "src0 + src1")
 364 binop("iadd", tint, commutative + associative, "src0 + src1")
 365 binop("fsub", tfloat, "", "src0 - src1")
 366 binop("isub", tint, "", "src0 - src1")
 367
 368 binop("fmul", tfloat, commutative + associative, "src0 * src1")
 369 # low 32-bits of signed/unsigned integer multiply
 370 binop("imul", tint, commutative + associative, "src0 * src1")
 371 # high 32-bits of signed integer multiply
 372 binop("imul_high", tint32, commutative,
 373       "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
 374 # high 32-bits of unsigned integer multiply
 375 binop("umul_high", tuint32, commutative,
 376       "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")
 377
 378 binop("fdiv", tfloat, "", "src0 / src1")
 379 binop("idiv", tint, "", "src0 / src1")
 380 binop("udiv", tuint, "", "src0 / src1")
 381
 382 # returns a boolean representing the carry resulting from the addition of
 383 # the two unsigned arguments.
 384
 385 binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")
 386
 387 # returns a boolean representing the borrow resulting from the subtraction
 388 # of the two unsigned arguments.
 389
 390 binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 391
 392 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 393 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
 394
 395 #
 396 # Comparisons
 397 #
 398
 399
 400 # these integer-aware comparisons return a boolean (0 or ~0)
 401
 402 binop_compare("flt", tfloat, "", "src0 < src1")
 403 binop_compare("fge", tfloat, "", "src0 >= src1")
 404 binop_compare("feq", tfloat, commutative, "src0 == src1")
 405 binop_compare("fne", tfloat, commutative, "src0 != src1")
 406 binop_compare("ilt", tint, "", "src0 < src1")
 407 binop_compare("ige", tint, "", "src0 >= src1")
 408 binop_compare("ieq", tint, commutative, "src0 == src1")
 409 binop_compare("ine", tint, commutative, "src0 != src1")
 410 binop_compare("ult", tuint, "", "src0 < src1")
 411 binop_compare("uge", tuint, "", "src0 >= src1")
 412
 413 # integer-aware GLSL-style comparisons that compare floats and ints
 414
 415 binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
 416              "{src0} && {src1}", "{src}")
 417 binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
 418              "{src0} || {src1}", "{src}")
 419 binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
 420              "{src0} && {src1}", "{src}")
 421 binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
 422              "{src0} || {src1}", "{src}")
 423
 424 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 425
 426 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
 427              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
 428 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
 429              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 430
 431 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 432 # and false respectively
 433
 434 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
 435 binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 436 binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 437 binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 438
 439
 440 binop("ishl", tint, "", "src0 << src1")
 441 binop("ishr", tint, "", "src0 >> src1")
 442 binop("ushr", tuint, "", "src0 >> src1")
 443
 444 # bitwise logic operators
 445 #
 446 # These are also used as boolean and, or, xor for hardware supporting
 447 # integers.
 448
 449
 450 binop("iand", tuint, commutative + associative, "src0 & src1")
 451 binop("ior", tuint, commutative + associative, "src0 | src1")
 452 binop("ixor", tuint, commutative + associative, "src0 ^ src1")
 453
 454
 455 # floating point logic operators
 456 #
 457 # These use (src != 0.0) for testing the truth of the input, and output 1.0
 458 # for true and 0.0 for false
 459
 460 binop("fand", tfloat32, commutative,
 461       "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
 462 binop("for", tfloat32, commutative,
 463       "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
 464 binop("fxor", tfloat32, commutative,
 465       "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
 466
 467 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 468              "{src}")
 469
 470 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 471              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 472
 473 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
 474        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 475 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
 476        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 477
 478 binop("fmin", tfloat, "", "fminf(src0, src1)")
 479 binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
 480 binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
 481 binop("fmax", tfloat, "", "fmaxf(src0, src1)")
 482 binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
 483 binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 484
 485 # Saturated vector add for 4 8bit ints.
 486 binop("usadd_4x8", tint32, commutative + associative, """
 487 dst = 0;
 488 for (int i = 0; i < 32; i += 8) {
 489    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 490 }
 491 """)
 492
 493 # Saturated vector subtract for 4 8bit ints.
 494 binop("ussub_4x8", tint32, "", """
 495 dst = 0;
 496 for (int i = 0; i < 32; i += 8) {
 497    int src0_chan = (src0 >> i) & 0xff;
 498    int src1_chan = (src1 >> i) & 0xff;
 499    if (src0_chan > src1_chan)
 500       dst |= (src0_chan - src1_chan) << i;
 501 }
 502 """)
 503
 504 # vector min for 4 8bit ints.
 505 binop("umin_4x8", tint32, commutative + associative, """
 506 dst = 0;
 507 for (int i = 0; i < 32; i += 8) {
 508    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 509 }
 510 """)
 511
 512 # vector max for 4 8bit ints.
 513 binop("umax_4x8", tint32, commutative + associative, """
 514 dst = 0;
 515 for (int i = 0; i < 32; i += 8) {
 516    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 517 }
 518 """)
 519
 520 # unorm multiply: (a * b) / 255.
 521 binop("umul_unorm_4x8", tint32, commutative + associative, """
 522 dst = 0;
 523 for (int i = 0; i < 32; i += 8) {
 524    int src0_chan = (src0 >> i) & 0xff;
 525    int src1_chan = (src1 >> i) & 0xff;
 526    dst |= ((src0_chan * src1_chan) / 255) << i;
 527 }
 528 """)
 529
 530 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 531
 532 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 533             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 534
 535 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 536 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
 537 # if either of its arguments are 32.
 538 binop_convert("bfm", tuint32, tint32, "", """
 539 int bits = src0, offset = src1;
 540 if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
 541    dst = 0; /* undefined */
 542 else
 543    dst = ((1u << bits) - 1) << offset;
 544 """)
 545
 546 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
 547 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 548 /* flush denormals to zero. */
 549 if (!isnormal(dst))
 550    dst = copysignf(0.0f, src0);
 551 """)
 552
 553 # Combines the first component of each input to make a 2-component vector.
 554
 555 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 556 dst.x = src0.x;
 557 dst.y = src1.x;
 558 """)
 559
 560 # Byte extraction
 561 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 562 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 563
 564 # Word extraction
 565 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 566 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 567
 568
 569 def triop(name, ty, const_expr):
 570    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 571 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 572    opcode(name, output_size, tuint,
 573    [src1_size, src2_size, src3_size],
 574    [tuint, tuint, tuint], "", const_expr)
 575
 576 triop("ffma", tfloat, "src0 * src1 + src2")
 577
 578 triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
 579
 580 # Conditional Select
 581 #
 582 # A vector conditional select instruction (like ?:, but operating per-
 583 # component on vectors). There are two versions, one for floating point
 584 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 585
 586
 587 triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
 588 opcode("bcsel", 0, tuint, [0, 0, 0],
 589       [tbool, tuint, tuint], "", "src0 ? src1 : src2")
 590
 591 # SM5 bfi assembly
 592 triop("bfi", tuint32, """
 593 unsigned mask = src0, insert = src1, base = src2;
 594 if (mask == 0) {
 595    dst = base;
 596 } else {
 597    unsigned tmp = mask;
 598    while (!(tmp & 1)) {
 599       tmp >>= 1;
 600       insert <<= 1;
 601    }
 602    dst = (base & ~mask) | (insert & mask);
 603 }
 604 """)
 605
 606 # SM5 ubfe/ibfe assembly
 607 opcode("ubfe", 0, tuint32,
 608        [0, 0, 0], [tuint32, tint32, tint32], "", """
 609 unsigned base = src0;
 610 int offset = src1, bits = src2;
 611 if (bits == 0) {
 612    dst = 0;
 613 } else if (bits < 0 || offset < 0) {
 614    dst = 0; /* undefined */
 615 } else if (offset + bits < 32) {
 616    dst = (base << (32 - bits - offset)) >> (32 - bits);
 617 } else {
 618    dst = base >> offset;
 619 }
 620 """)
 621 opcode("ibfe", 0, tint32,
 622        [0, 0, 0], [tint32, tint32, tint32], "", """
 623 int base = src0;
 624 int offset = src1, bits = src2;
 625 if (bits == 0) {
 626    dst = 0;
 627 } else if (bits < 0 || offset < 0) {
 628    dst = 0; /* undefined */
 629 } else if (offset + bits < 32) {
 630    dst = (base << (32 - bits - offset)) >> (32 - bits);
 631 } else {
 632    dst = base >> offset;
 633 }
 634 """)
 635
 636 # GLSL bitfieldExtract()
 637 opcode("ubitfield_extract", 0, tuint32,
 638        [0, 0, 0], [tuint32, tint32, tint32], "", """
 639 unsigned base = src0;
 640 int offset = src1, bits = src2;
 641 if (bits == 0) {
 642    dst = 0;
 643 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
 644    dst = 0; /* undefined per the spec */
 645 } else {
 646    dst = (base >> offset) & ((1ull << bits) - 1);
 647 }
 648 """)
 649 opcode("ibitfield_extract", 0, tint32,
 650        [0, 0, 0], [tint32, tint32, tint32], "", """
 651 int base = src0;
 652 int offset = src1, bits = src2;
 653 if (bits == 0) {
 654    dst = 0;
 655 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 656    dst = 0;
 657 } else {
 658    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 659 }
 660 """)
 661
 662 # Combines the first component of each input to make a 3-component vector.
 663
 664 triop_horiz("vec3", 3, 1, 1, 1, """
 665 dst.x = src0.x;
 666 dst.y = src1.x;
 667 dst.z = src2.x;
 668 """)
 669
 670 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 671                  src4_size, const_expr):
 672    opcode(name, output_size, tuint,
 673           [src1_size, src2_size, src3_size, src4_size],
 674           [tuint, tuint, tuint, tuint],
 675           "", const_expr)
 676
 677 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
 678        [tuint32, tuint32, tint32, tint32], "", """
 679 unsigned base = src0, insert = src1;
 680 int offset = src2, bits = src3;
 681 if (bits == 0) {
 682    dst = 0;
 683 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
 684    dst = 0;
 685 } else {
 686    unsigned mask = ((1ull << bits) - 1) << offset;
 687    dst = (base & ~mask) | ((insert << bits) & mask);
 688 }
 689 """)
 690
 691 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
 692 dst.x = src0.x;
 693 dst.y = src1.x;
 694 dst.z = src2.x;
 695 dst.w = src3.x;
 696 """)
 697
 698