src/compiler/nir/nir_opcodes.py

   1 #! /usr/bin/env python
   2 #
   3 # Copyright (C) 2014 Connor Abbott
   4 #
   5 # Permission is hereby granted, free of charge, to any person obtaining a
   6 # copy of this software and associated documentation files (the "Software"),
   7 # to deal in the Software without restriction, including without limitation
   8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9 # and/or sell copies of the Software, and to permit persons to whom the
  10 # Software is furnished to do so, subject to the following conditions:
  11 #
  12 # The above copyright notice and this permission notice (including the next
  13 # paragraph) shall be included in all copies or substantial portions of the
  14 # Software.
  15 #
  16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22 # IN THE SOFTWARE.
  23 #
  24 # Authors:
  25 #    Connor Abbott (cwabbott0@gmail.com)
  26
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  43         prepended before each entry
  44       - const_expr is an expression or series of statements that computes the
  45         constant value of the opcode given the constant values of its inputs.
  46
  47       Constant expressions are formed from the variables src0, src1, ...,
  48       src(N-1), where N is the number of arguments.  The output of the
  49       expression should be stored in the dst variable.  Per-component input
  50       and output variables will be scalars and non-per-component input and
  51       output variables will be a struct with fields named x, y, z, and w
  52       all of the correct type.  Input and output variables can be assumed
  53       to already be of the correct type and need no conversion.  In
  54       particular, the conversion from the C bool type to/from  NIR_TRUE and
  55       NIR_FALSE happens automatically.
  56
  57       For per-component instructions, the entire expression will be
  58       executed once for each component.  For non-per-component
  59       instructions, the expression is expected to store the correct values
  60       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  61       constant expression, an assignment to dst will happen automatically
  62       and the result will be equivalent to "dst = <expression>" for
  63       per-component instructions and "dst.x = dst.y = ... = <expression>"
  64       for non-per-component instructions.
  65       """
  66       assert isinstance(name, str)
  67       assert isinstance(output_size, int)
  68       assert isinstance(output_type, str)
  69       assert isinstance(input_sizes, list)
  70       assert isinstance(input_sizes[0], int)
  71       assert isinstance(input_types, list)
  72       assert isinstance(input_types[0], str)
  73       assert isinstance(algebraic_properties, str)
  74       assert isinstance(const_expr, str)
  75       assert len(input_sizes) == len(input_types)
  76       assert 0 <= output_size <= 4
  77       for size in input_sizes:
  78          assert 0 <= size <= 4
  79          if output_size != 0:
  80             assert size != 0
  81       self.name = name
  82       self.num_inputs = len(input_sizes)
  83       self.output_size = output_size
  84       self.output_type = output_type
  85       self.input_sizes = input_sizes
  86       self.input_types = input_types
  87       self.algebraic_properties = algebraic_properties
  88       self.const_expr = const_expr
  89
  90 # helper variables for strings
  91 tfloat = "float"
  92 tint = "int"
  93 tbool = "bool32"
  94 tuint = "uint"
  95 tfloat32 = "float32"
  96 tint32 = "int32"
  97 tuint32 = "uint32"
  98 tfloat64 = "float64"
  99
 100 commutative = "commutative "
 101 associative = "associative "
 102
 103 # global dictionary of opcodes
 104 opcodes = {}
 105
 106 def opcode(name, output_size, output_type, input_sizes, input_types,
 107            algebraic_properties, const_expr):
 108    assert name not in opcodes
 109    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 110                           input_types, algebraic_properties, const_expr)
 111
 112 def unop_convert(name, out_type, in_type, const_expr):
 113    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
 114
 115 def unop(name, ty, const_expr):
 116    opcode(name, 0, ty, [0], [ty], "", const_expr)
 117
 118 def unop_horiz(name, output_size, output_type, input_size, input_type,
 119                const_expr):
 120    opcode(name, output_size, output_type, [input_size], [input_type], "",
 121           const_expr)
 122
 123 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 124                 reduce_expr, final_expr):
 125    def prereduce(src):
 126       return "(" + prereduce_expr.format(src=src) + ")"
 127    def final(src):
 128       return final_expr.format(src="(" + src + ")")
 129    def reduce_(src0, src1):
 130       return reduce_expr.format(src0=src0, src1=src1)
 131    src0 = prereduce("src0.x")
 132    src1 = prereduce("src0.y")
 133    src2 = prereduce("src0.z")
 134    src3 = prereduce("src0.w")
 135    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 136               final(reduce_(src0, src1)))
 137    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 138               final(reduce_(reduce_(src0, src1), src2)))
 139    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 140               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 141
 142
# Unary opcodes.  Each constant expression is a C expression over src0; none
# of them mentions "dst", so the result is assigned to dst automatically (see
# the Opcode docstring).

# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Negation, logical/bitwise not, sign and absolute value.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# 1.0 when the input is 0.0, otherwise 0.0
unop("fnot", tfloat, "(src0 == 0.0f) ? 1.0f : 0.0f")
# 0, 1 or -1 for a zero, positive or negative input respectively
unop("fsign", tfloat, "(src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f)")
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabsf(src0)")
# saturate: clamp to the range [0.0, 1.0]
unop("fsat", tfloat, "(src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0)")
# Reciprocal, reciprocal square root, square root, base-2 exp and log.
unop("frcp", tfloat, "1.0f / src0")
unop("frsq", tfloat, "1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
# Type conversions.  The plain "src0" expressions rely on input and output
# variables already having the declared types (see the Opcode docstring).
unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
# Float-to-boolean conversion
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
# Boolean-to-float conversion
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tbool, tint32, "src0 != 0")
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
 173
# Unary floating-point rounding operations.
#
# Each expression tests bit_size and calls the double-precision C function
# for 64-bit operands and the single-precision "f" variant otherwise.
# bit_size is not defined in this file; presumably it is supplied by
# whatever evaluates these expressions -- TODO confirm.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# fractional part: the input minus its floor
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")


# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.
#
# All six variants share the same constant expression, 0.0.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 200
 201
 202 # Floating point pack and unpack operations.
 203
 204 def pack_2x16(fmt):
 205    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 206 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 207 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 208 """.replace("fmt", fmt))
 209
 210 def pack_4x8(fmt):
 211    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 212 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 213 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 214 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 215 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 216 """.replace("fmt", fmt))
 217
 218 def unpack_2x16(fmt):
 219    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 220 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 221 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 222 """.replace("fmt", fmt))
 223
 224 def unpack_4x8(fmt):
 225    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 226 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 227 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 228 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 229 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 230 """.replace("fmt", fmt))
 231
 232
# Instantiate the pack and unpack opcodes for each format.  "snorm" and
# "unorm" get both the 2x16 and the 4x8 variant; "half" only gets 2x16.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
 243
 244 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 245 dst.x = (src0.x & 0xffff) | (src0.y >> 16);
 246 """)
 247
# Packs four components into one 32-bit word: x at bit 0, y at bit 8, z at
# bit 16 and w at bit 24.
# NOTE(review): the components are not masked to 8 bits here, so any higher
# bits of x, y or z would end up in the neighbouring bytes -- presumably the
# inputs are expected to already fit in 8 bits; confirm.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Lowered floating point unpacking operations.
#
# split_x passes the low 16 bits of the source to unpack_half_1x16, split_y
# the high 16 bits.


unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")
 262
 263
# Bit operations, part of ARB_gpu_shader5.


# Reverses the order of the 32 bits: source bit N becomes result bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts how many of the 32 bits are set.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")
 280
 281 unop_convert("ufind_msb", tint32, tuint32, """
 282 dst = -1;
 283 for (int bit = 31; bit > 0; bit--) {
 284    if ((src0 >> bit) & 1) {
 285       dst = bit;
 286       break;
 287    }
 288 }
 289 """)
 290
# Signed variant of find-most-significant-bit: scanning down from bit 31,
# returns the index of the first bit that differs from the sign (the first 1
# for non-negative inputs, the first 0 for negative ones), or -1 if there is
# no such bit.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, scanning up from bit 0, or -1 when
# no bit is set.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 314
 315
 316 for i in xrange(1, 5):
 317    for j in xrange(1, 5):
 318       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 319
 320 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 321    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 322
 323 def binop(name, ty, alg_props, const_expr):
 324    binop_convert(name, ty, ty, alg_props, const_expr)
 325
 326 def binop_compare(name, ty, alg_props, const_expr):
 327    binop_convert(name, tbool, ty, alg_props, const_expr)
 328
 329 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 330                 src2_type, const_expr):
 331    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 332           "", const_expr)
 333
 334 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 335                  reduce_expr, final_expr):
 336    def final(src):
 337       return final_expr.format(src= "(" + src + ")")
 338    def reduce_(src0, src1):
 339       return reduce_expr.format(src0=src0, src1=src1)
 340    def prereduce(src0, src1):
 341       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 342    src0 = prereduce("src0.x", "src1.x")
 343    src1 = prereduce("src0.y", "src1.y")
 344    src2 = prereduce("src0.z", "src1.z")
 345    src3 = prereduce("src0.w", "src1.w")
 346    opcode(name + "2", output_size, output_type,
 347           [2, 2], [src_type, src_type], commutative,
 348           final(reduce_(src0, src1)))
 349    opcode(name + "3", output_size, output_type,
 350           [3, 3], [src_type, src_type], commutative,
 351           final(reduce_(reduce_(src0, src1), src2)))
 352    opcode(name + "4", output_size, output_type,
 353           [4, 4], [src_type, src_type], commutative,
 354           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 355
# Basic two-source arithmetic.  Addition and multiplication are marked both
# commutative and associative; subtraction and division carry no properties.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
# (the product is computed in 64 bits and shifted down by 32)
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# NOTE(review): unlike umod below, idiv and udiv have no guard for
# src1 == 0 -- confirm whether a zero divisor can reach these expressions.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
binop("udiv", tuint, "", "src0 / src1")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
# (the wrapped sum is smaller than an operand exactly when a carry occurred;
# the result type is tuint, not tbool)

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
# (the result type is tuint, not tbool)

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# floored modulus: src0 minus src1 times the floor of their quotient
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
# unsigned remainder; a zero divisor yields 0 instead of dividing
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
 387
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
# (registered through binop_compare, so the destination type is tbool)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# Each call registers the 2-, 3- and 4-component variants.  "ball" combines
# the per-component results with &&, "bany" with ||.

binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 431
 432
# Shifts: left shift, and right shift on a signed (ishr) or an unsigned
# (ushr) operand.
binop("ishl", tint, "", "src0 << src1")
binop("ishr", tint, "", "src0 >> src1")
binop("ushr", tuint, "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
# true when exactly one of the two inputs is non-zero
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: the sum of the component-wise products, registered for 2, 3
# and 4 components.  fdot has an output size of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Same expression as fdot, but registered with an output size of 4.
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Three-component dot product of src0 and src1 with src1.w added on top.
# src0 has 3 components and src1 has 4.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.  The integer versions are marked commutative and
# associative; the float versions carry no algebraic properties.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 477
# The following opcodes treat each 32-bit operand as four 8-bit channels.
# Every loop below visits the channels at bit offsets 0, 8, 16 and 24,
# extracts them with "(src >> i) & 0xff" and ORs the per-channel result back
# into dst at the same offset.

# Saturated vector add for 4 8bit ints.
# (each channel sum is clamped to 0xff with MIN2)
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# (a channel stays 0 unless src0's channel is larger than src1's)
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
# (integer division, applied per channel)
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 522
 523 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 524
# Packs two floats as halves into one 32-bit word: src0.x in the low 16 bits,
# src1.x in the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# The result is a mask of "bits" one-bits starting at bit "offset"; every
# out-of-range combination of the two arguments folds to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# ldexp: src0 is the float operand, src1 the integer exponent.
# NOTE(review): isnormal() is false not only for denormals and zero but also
# for infinities and NaN, so those results are replaced by a signed zero as
# well -- confirm that this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
 545
# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# src1 is the index of the byte within src0; the cast to uint8_t or int8_t
# keeps the low 8 bits of the shifted value as an unsigned or a signed byte.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# Same as above with 16-bit words: src1 is the index of the word in src0.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 560
 561
 562 def triop(name, ty, const_expr):
 563    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 564 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 565    opcode(name, output_size, tuint,
 566    [src1_size, src2_size, src3_size],
 567    [tuint, tuint, tuint], "", const_expr)
 568
# Fused multiply-add; the constant expression is a plain multiply followed
# by an add.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with src2 as the weight:
# a weight of 0 yields src0 and a weight of 1 yields src1.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
# bcsel cannot use triop() because its condition has a different type
# (tbool) than its two value sources (tuint).
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool, tuint, tuint], "", "src0 ? src1 : src2")

# SM5 bfi assembly
#
# Inserts "insert" into "base" under "mask".  The while loop shifts insert
# left by the number of trailing zero bits in mask, so that bit 0 of insert
# lines up with the lowest set bit of the mask.  A mask of 0 returns base
# unchanged (and avoids an endless loop).
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")
 598
# SM5 ubfe/ibfe assembly
#
# Extract "bits" bits of "base" starting at bit "offset".  When the field
# fits below bit 32, it is first shifted up so its top bit sits at bit 31
# and then shifted back down by (32 - bits); with the unsigned base of ubfe
# the second shift fills with zeros.  Otherwise the result is simply base
# shifted right by offset.
# NOTE(review): nothing here bounds offset to at most 31, so the final
# "base >> offset" branch can see a shift count of 32 or more -- confirm
# that callers never pass such offsets.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
# Same structure as ubfe, but base is a signed int, so the right shifts
# operate on a signed value.
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
#
# Unsigned version: shift the field down to bit 0 and mask off everything
# above it.  The mask is built from a 64-bit one (1ull) so that bits == 32
# does not shift by the full width of a 32-bit value.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
 642 opcode("ibitfield_extract", 0, tint32,
 643        [0, 0, 0], [tint32, tint32, tint32], "", """
 644 int base = src0;
 645 int offset = src1, bits = src2;
 646 if (bits == 0) {
 647    dst = 0;
 648 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 649    dst = 0;
 650 } else {
 651    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 652 }
 653 """)
 654
# Combines the first component of each input to make a 3-component vector.
# (output size 3, three sources of size 1 each)

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
 662
 663 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 664                  src4_size, const_expr):
 665    opcode(name, output_size, tuint,
 666           [src1_size, src2_size, src3_size, src4_size],
 667           [tuint, tuint, tuint, tuint],
 668           "", const_expr)
 669
 670 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
 671        [tuint32, tuint32, tint32, tint32], "", """
 672 unsigned base = src0, insert = src1;
 673 int offset = src2, bits = src3;
 674 if (bits == 0) {
 675    dst = 0;
 676 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
 677    dst = 0;
 678 } else {
 679    unsigned mask = ((1ull << bits) - 1) << offset;
 680    dst = (base & ~mask) | ((insert << bits) & mask);
 681 }
 682 """)
 683
# Combines the first component of each input to make a 4-component vector.
# (output size 4, four sources of size 1 each)
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
 690
 691