src/compiler/nir/nir_opcodes.py

   1 #! /usr/bin/env python
   2 #
   3 # Copyright (C) 2014 Connor Abbott
   4 #
   5 # Permission is hereby granted, free of charge, to any person obtaining a
   6 # copy of this software and associated documentation files (the "Software"),
   7 # to deal in the Software without restriction, including without limitation
   8 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9 # and/or sell copies of the Software, and to permit persons to whom the
  10 # Software is furnished to do so, subject to the following conditions:
  11 #
  12 # The above copyright notice and this permission notice (including the next
  13 # paragraph) shall be included in all copies or substantial portions of the
  14 # Software.
  15 #
  16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22 # IN THE SOFTWARE.
  23 #
  24 # Authors:
  25 #    Connor Abbott (cwabbott0@gmail.com)
  26
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  43         prepended before each entry
  44       - const_expr is an expression or series of statements that computes the
  45         constant value of the opcode given the constant values of its inputs.
  46
  47       Constant expressions are formed from the variables src0, src1, ...,
  48       src(N-1), where N is the number of arguments.  The output of the
  49       expression should be stored in the dst variable.  Per-component input
  50       and output variables will be scalars and non-per-component input and
  51       output variables will be a struct with fields named x, y, z, and w
  52       all of the correct type.  Input and output variables can be assumed
  53       to already be of the correct type and need no conversion.  In
  54       particular, the conversion from the C bool type to/from  NIR_TRUE and
  55       NIR_FALSE happens automatically.
  56
  57       For per-component instructions, the entire expression will be
  58       executed once for each component.  For non-per-component
  59       instructions, the expression is expected to store the correct values
  60       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  61       constant expression, an assignment to dst will happen automatically
  62       and the result will be equivalent to "dst = <expression>" for
  63       per-component instructions and "dst.x = dst.y = ... = <expression>"
  64       for non-per-component instructions.
  65       """
  66       assert isinstance(name, str)
  67       assert isinstance(output_size, int)
  68       assert isinstance(output_type, str)
  69       assert isinstance(input_sizes, list)
  70       assert isinstance(input_sizes[0], int)
  71       assert isinstance(input_types, list)
  72       assert isinstance(input_types[0], str)
  73       assert isinstance(algebraic_properties, str)
  74       assert isinstance(const_expr, str)
  75       assert len(input_sizes) == len(input_types)
  76       assert 0 <= output_size <= 4
  77       for size in input_sizes:
  78          assert 0 <= size <= 4
  79          if output_size != 0:
  80             assert size != 0
  81       self.name = name
  82       self.num_inputs = len(input_sizes)
  83       self.output_size = output_size
  84       self.output_type = output_type
  85       self.input_sizes = input_sizes
  86       self.input_types = input_types
  87       self.algebraic_properties = algebraic_properties
  88       self.const_expr = const_expr
  89
# Helper variables for the type strings handed to the opcode helpers below.
# Each string gets nir_type_ prepended to it (see the Opcode constructor
# documentation).  tfloat, tint and tuint carry no bit size in their name;
# the others name an explicit width.
tfloat = "float"
tint = "int"
tbool = "bool32"
tuint = "uint"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Algebraic property strings.  Each one ends in a space so that several of
# them can be joined with "+" into a single space-separated string.
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes, keyed by opcode name and filled in by opcode()
opcodes = {}
 107
 108 def opcode(name, output_size, output_type, input_sizes, input_types,
 109            algebraic_properties, const_expr):
 110    assert name not in opcodes
 111    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 112                           input_types, algebraic_properties, const_expr)
 113
 114 def unop_convert(name, out_type, in_type, const_expr):
 115    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
 116
 117 def unop(name, ty, const_expr):
 118    opcode(name, 0, ty, [0], [ty], "", const_expr)
 119
 120 def unop_horiz(name, output_size, output_type, input_size, input_type,
 121                const_expr):
 122    opcode(name, output_size, output_type, [input_size], [input_type], "",
 123           const_expr)
 124
 125 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 126                 reduce_expr, final_expr):
 127    def prereduce(src):
 128       return "(" + prereduce_expr.format(src=src) + ")"
 129    def final(src):
 130       return final_expr.format(src="(" + src + ")")
 131    def reduce_(src0, src1):
 132       return reduce_expr.format(src0=src0, src1=src1)
 133    src0 = prereduce("src0.x")
 134    src1 = prereduce("src0.y")
 135    src2 = prereduce("src0.z")
 136    src3 = prereduce("src0.w")
 137    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 138               final(reduce_(src0, src1)))
 139    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 140               final(reduce_(reduce_(src0, src1), src2)))
 141    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 142               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 143
 144
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 for a zero input and 0.0 otherwise.  Like many float
# opcodes below, the expression tests bit_size to choose between the double
# and the float flavour of its literals or libm calls.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# Sign of the input as -1.0, 0.0 or 1.0.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)")
# Clamp to [0.0, 1.0].
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): fexp2 and flog2 always call the single-precision libm
# functions although tfloat is unsized -- confirm this is intended for
# 64-bit sources.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 167 unop("fexp2", tfloat, "exp2f(src0)")
 168 unop("flog2", tfloat, "log2f(src0)")
 169 unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
 170 unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
 171 unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
 172 unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
 173 unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
 174 unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
 175 unop_convert("i2i32", tint32, tint, "src0")    # General int (int8_t, int64_t, etc.) to int32_t conversion
 176 unop_convert("u2i32", tint32, tuint, "src0")   # General uint (uint8_t, uint64_t, etc.) to int32_t conversion
 177 unop_convert("i2u32", tuint32, tint, "src0")   # General int (int8_t, int64_t, etc.) to uint32_t conversion
 178 unop_convert("u2u32", tuint32, tuint, "src0")  # General uint (uint8_t, uint64_t, etc.) to uint32_t conversion
 179 unop_convert("i2i64", tint64, tint, "src0")    # General int (int8_t, int32_t, etc.) to int64_t conversion
 180 unop_convert("u2i64", tint64, tuint, "src0")   # General uint (uint8_t, uint64_t, etc.) to int64_t conversion
 181 unop_convert("f2i64", tint64, tfloat, "src0")  # General float (float or double) to int64_t conversion
 182 unop_convert("i2u64", tuint64, tint,  "src0")  # General int (int8_t, int64_t, etc.) to uint64_t conversion
 183 unop_convert("u2u64", tuint64, tuint, "src0")  # General uint (uint8_t, uint32_t, etc.) to uint64_t conversion
 184 unop_convert("f2u64", tuint64, tfloat, "src0") # General float (float or double) to uint64_t conversion
 185 unop_convert("i642f", tfloat32, tint64, "src0")  # int64_t-to-float conversion.
 186 unop_convert("i642d", tfloat64, tint64, "src0")  # int64_t-to-double conversion.
 187 unop_convert("u642f", tfloat32, tuint64, "src0") # uint64_t-to-float conversion.
 188 unop_convert("u642d", tfloat64, tuint64, "src0") # uint64_t-to-double conversion.
 189
 190 # Float-to-boolean conversion
 191 unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
 192 unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
 193 # Boolean-to-float conversion
 194 unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
 195 # Int-to-boolean conversion
 196 unop_convert("i2b", tbool, tint, "src0 != 0")
 197 unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
 198 unop_convert("b2i64", tint64, tbool, "src0 ? 1 : 0")  # Boolean-to-int64_t conversion.
 199 unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
 200 unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
 201 # double-to-float conversion
 202 unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
 203 unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision
 204
# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# Fractional part, computed as src0 minus its floor.
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Inputs whose magnitude is below 2^-14 become a zero carrying the sign of
# src0; everything else is converted to half float and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.  All six variants fold to the same constant.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 232
 233
 234 # Floating point pack and unpack operations.
 235
 236 def pack_2x16(fmt):
 237    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 238 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 239 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 240 """.replace("fmt", fmt))
 241
 242 def pack_4x8(fmt):
 243    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 244 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 245 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 246 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 247 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 248 """.replace("fmt", fmt))
 249
 250 def unpack_2x16(fmt):
 251    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 252 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 253 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 254 """.replace("fmt", fmt))
 255
 256 def unpack_4x8(fmt):
 257    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 258 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 259 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 260 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 261 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 262 """.replace("fmt", fmt))
 263
 264
# Instantiate the pack/unpack opcodes for each format.  "half" only exists in
# the 2x16 flavour.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Low 16 bits of x in the low half, y shifted into the high half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# One component per byte, x lowest.
# NOTE(review): the components are not masked to 8 bits before shifting --
# presumably the sources are expected to already fit in a byte; confirm.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# x supplies the low 32 bits of the 64-bit result and y the high 32 bits.
unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")
 289
 290 unop_horiz("pack_int_2x32", 1, tint64, 2, tint32,
 291            "dst.x = src0.x | ((int64_t)src0.y << 32);")
 292
# Inverse of the 2x32 packs: dst.x receives the low 32 bits of the 64-bit
# source and dst.y the high 32 bits.
unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_int_2x32", 2, tint32, 1, tint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.


# Each of these unpacks only one 16-bit half of the source word.
unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

# Likewise for 64-bit sources: split_x is the low 32 bits, split_y the high
# 32 bits.
unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")
unop_convert("unpack_int_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_int_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


# Mirror the 32 bits of the source: bit N moves to bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Number of set bits in the 32-bit source.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")
 328
 329 unop_convert("ufind_msb", tint32, tuint32, """
 330 dst = -1;
 331 for (int bit = 31; bit > 0; bit--) {
 332    if ((src0 >> bit) & 1) {
 333       dst = bit;
 334       break;
 335    }
 336 }
 337 """)
 338
# Signed counterpart of ufind_msb: scanning down from bit 31, report the
# first bit that differs from the sign (first 1 for non-negative inputs,
# first 0 for negative ones), or -1 if there is none.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when the input is zero.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 362
 363
 364 for i in xrange(1, 5):
 365    for j in xrange(1, 5):
 366       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 367
 368 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 369    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 370
 371 def binop(name, ty, alg_props, const_expr):
 372    binop_convert(name, ty, ty, alg_props, const_expr)
 373
 374 def binop_compare(name, ty, alg_props, const_expr):
 375    binop_convert(name, tbool, ty, alg_props, const_expr)
 376
 377 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 378                 src2_type, const_expr):
 379    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 380           "", const_expr)
 381
 382 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 383                  reduce_expr, final_expr):
 384    def final(src):
 385       return final_expr.format(src= "(" + src + ")")
 386    def reduce_(src0, src1):
 387       return reduce_expr.format(src0=src0, src1=src1)
 388    def prereduce(src0, src1):
 389       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 390    src0 = prereduce("src0.x", "src1.x")
 391    src1 = prereduce("src0.y", "src1.y")
 392    src2 = prereduce("src0.z", "src1.z")
 393    src3 = prereduce("src0.w", "src1.w")
 394    opcode(name + "2", output_size, output_type,
 395           [2, 2], [src_type, src_type], commutative,
 396           final(reduce_(src0, src1)))
 397    opcode(name + "3", output_size, output_type,
 398           [3, 3], [src_type, src_type], commutative,
 399           final(reduce_(reduce_(src0, src1), src2)))
 400    opcode(name + "4", output_size, output_type,
 401           [4, 4], [src_type, src_type], commutative,
 402           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 403
# Basic arithmetic.  Addition and multiplication are registered as both
# commutative and associative; subtraction and division have no properties.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# NOTE(review): unlike umod/irem/imod below, idiv and udiv do not guard
# against src1 == 0 -- confirm how a division by zero is meant to fold.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
binop("udiv", tuint, "", "src0 / src1")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.  The result type is tuint, so the C comparison
# result is stored as an integer.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.  As with uadd_carry, the result type is
# tuint.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# Unsigned remainder; a zero divisor folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod keeps the C remainder when it is zero or when both operands have the
# same sign, and otherwise adds src1 to it.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient towards negative infinity, frem towards zero.
# NOTE(review): both use the single-precision floorf/truncf regardless of
# bit_size -- confirm for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 449
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# Each call registers 2-, 3- and 4-component variants.  The "all equal"
# forms AND the per-component results together, the "any not equal" forms OR
# them.

binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal


# Shifts.  These go through opcode() directly because the shift count is
# always a uint32 while the shifted value is an unsized int or uint.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products over 2, 3 and 4 components.  The plain form has a
# one-component result; the _replicated form is declared with a
# four-component result.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: three-component dot product of src0 with src1.xyz, plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.
# NOTE(review): fmin/fmax always call the single-precision fminf/fmaxf and,
# unlike the integer variants, carry no algebraic properties -- confirm both
# are intended.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 539
# The following opcodes treat each 32-bit source as four 8-bit lanes.  Every
# expression walks the lanes at bit offsets 0, 8, 16 and 24, extracts them
# with "& 0xff", and ORs the per-lane result back into dst at the same
# offset.

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.  A lane where src0 is not
# greater than src1 is left at zero.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 584
 585 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 586
# Two-source counterparts of the pack opcodes: src0 supplies the low half of
# the result and src1 the high half.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_int_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# src0 is the field width in bits and src1 the bit offset; the result is a
# mask of that many ones starting at the offset.  Out-of-range arguments fold
# to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# ldexp(src0, src1) with a float value and an int32 exponent.  Any result
# that isnormal() rejects is replaced by a zero with the sign of src0.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction: src1 selects which byte of src0 to return; the i8 form
# sign-extends through the int8_t cast, the u8 form zero-extends.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: same as above for 16-bit words.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 628
 629
 630 def triop(name, ty, const_expr):
 631    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 632 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 633    opcode(name, output_size, tuint,
 634    [src1_size, src2_size, src3_size],
 635    [tuint, tuint, tuint], "", const_expr)
 636
# Fused multiply-add, folded as a plain multiply followed by an add.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool, tuint, tuint], "", "src0 ? src1 : src2")

# SM5 bfi assembly
#
# src0 is the mask, src1 the value to insert and src2 the base.  The insert
# value is shifted left once for every trailing zero bit of the mask and then
# merged into base under the mask.  An all-zero mask returns base unchanged.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
#
# src0 is the base value, src1 the bit offset and src2 the field width.  When
# the field fits below bit 32 it is moved to the top of the word and shifted
# back down; otherwise the base is simply shifted right by the offset.  The
# signed form uses an int base, so its right shifts sign-extend.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
#
# Unsigned form: shift the field down to bit 0 and mask it to "bits" bits.
# The mask is built in 64 bits so that bits == 32 does not overflow the
# shift.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
 710 opcode("ibitfield_extract", 0, tint32,
 711        [0, 0, 0], [tint32, tint32, tint32], "", """
 712 int base = src0;
 713 int offset = src1, bits = src2;
 714 if (bits == 0) {
 715    dst = 0;
 716 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 717    dst = 0;
 718 } else {
 719    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 720 }
 721 """)
 722
# Combines the first component of each input to make a 3-component vector.
# All three sources are declared with size 1 and the result with size 3.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
 730
 731 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 732                  src4_size, const_expr):
 733    opcode(name, output_size, tuint,
 734           [src1_size, src2_size, src3_size, src4_size],
 735           [tuint, tuint, tuint, tuint],
 736           "", const_expr)
 737
 738 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
 739        [tuint32, tuint32, tint32, tint32], "", """
 740 unsigned base = src0, insert = src1;
 741 int offset = src2, bits = src3;
 742 if (bits == 0) {
 743    dst = 0;
 744 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
 745    dst = 0;
 746 } else {
 747    unsigned mask = ((1ull << bits) - 1) << offset;
 748    dst = (base & ~mask) | ((insert << bits) & mask);
 749 }
 750 """)
 751
# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
 758
 759