src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  43         prepended before each entry
  44       - const_expr is an expression or series of statements that computes the
  45         constant value of the opcode given the constant values of its inputs.
  46
  47       Constant expressions are formed from the variables src0, src1, ...,
  48       src(N-1), where N is the number of arguments.  The output of the
  49       expression should be stored in the dst variable.  Per-component input
  50       and output variables will be scalars and non-per-component input and
  51       output variables will be a struct with fields named x, y, z, and w
  52       all of the correct type.  Input and output variables can be assumed
  53       to already be of the correct type and need no conversion.  In
  54       particular, the conversion from the C bool type to/from  NIR_TRUE and
  55       NIR_FALSE happens automatically.
  56
  57       For per-component instructions, the entire expression will be
  58       executed once for each component.  For non-per-component
  59       instructions, the expression is expected to store the correct values
  60       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  61       constant expression, an assignment to dst will happen automatically
  62       and the result will be equivalent to "dst = <expression>" for
  63       per-component instructions and "dst.x = dst.y = ... = <expression>"
  64       for non-per-component instructions.
  65       """
  66       assert isinstance(name, str)
  67       assert isinstance(output_size, int)
  68       assert isinstance(output_type, str)
  69       assert isinstance(input_sizes, list)
  70       assert isinstance(input_sizes[0], int)
  71       assert isinstance(input_types, list)
  72       assert isinstance(input_types[0], str)
  73       assert isinstance(algebraic_properties, str)
  74       assert isinstance(const_expr, str)
  75       assert len(input_sizes) == len(input_types)
  76       assert 0 <= output_size <= 4
  77       for size in input_sizes:
  78          assert 0 <= size <= 4
  79          if output_size != 0:
  80             assert size != 0
  81       self.name = name
  82       self.num_inputs = len(input_sizes)
  83       self.output_size = output_size
  84       self.output_type = output_type
  85       self.input_sizes = input_sizes
  86       self.input_types = input_types
  87       self.algebraic_properties = algebraic_properties
  88       self.const_expr = const_expr
  89
# helper variables for strings
#
# Each constant holds the type-name string that opcode definitions pass as an
# input or output type.  The names without a numeric suffix (float, int, uint,
# bool) carry no bit size; the suffixed names include one.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
 104
 105 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 106
 107 def type_has_size(type_):
 108     m = _TYPE_SPLIT_RE.match(type_)
 109     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 110     return m.group('bits') is not None
 111
 112 def type_size(type_):
 113     m = _TYPE_SPLIT_RE.match(type_)
 114     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 115     assert m.group('bits') is not None, \
 116            'NIR type string has no bit size: "{}"'.format(type_)
 117     return int(m.group('bits'))
 118
 119 def type_sizes(type_):
 120     if type_has_size(type_):
 121         return [type_size(type_)]
 122     elif type_ == 'bool':
 123         return [1, 32]
 124     elif type_ == 'float':
 125         return [16, 32, 64]
 126     else:
 127         return [1, 8, 16, 32, 64]
 128
 129 def type_base_type(type_):
 130     m = _TYPE_SPLIT_RE.match(type_)
 131     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 132     return m.group('type')
 133
# Algebraic property tokens.  Each one keeps its trailing space so that
# several tokens can be concatenated with "+" into the space-separated
# algebraic_properties string.
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
# (opcode name -> Opcode instance, filled in by opcode() below)
opcodes = {}
 139
 140 def opcode(name, output_size, output_type, input_sizes, input_types,
 141            algebraic_properties, const_expr):
 142    assert name not in opcodes
 143    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 144                           input_types, algebraic_properties, const_expr)
 145
 146 def unop_convert(name, out_type, in_type, const_expr):
 147    opcode(name, 0, out_type, [0], [in_type], "", const_expr)
 148
 149 def unop(name, ty, const_expr):
 150    opcode(name, 0, ty, [0], [ty], "", const_expr)
 151
 152 def unop_horiz(name, output_size, output_type, input_size, input_type,
 153                const_expr):
 154    opcode(name, output_size, output_type, [input_size], [input_type], "",
 155           const_expr)
 156
 157 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 158                 reduce_expr, final_expr):
 159    def prereduce(src):
 160       return "(" + prereduce_expr.format(src=src) + ")"
 161    def final(src):
 162       return final_expr.format(src="(" + src + ")")
 163    def reduce_(src0, src1):
 164       return reduce_expr.format(src0=src0, src1=src1)
 165    src0 = prereduce("src0.x")
 166    src1 = prereduce("src0.y")
 167    src2 = prereduce("src0.z")
 168    src3 = prereduce("src0.w")
 169    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 170               final(reduce_(src0, src1)))
 171    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 172               final(reduce_(reduce_(src0, src1), src2)))
 173    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 174               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 175
 176
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Negation, bitwise complement and the float "logical not".
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 for a zero input and 0.0 for anything else.  In this and
# the following expressions the bit_size test only selects between double
# and float literals / math functions.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# Sign functions return -1, 0 or 1 in the operand's own type.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat clamps to the range [0, 1].
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): these two always call the single-precision C functions, even
# though the opcode type is the unsized float -- confirm that is intended for
# 64-bit operands.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 201
 202 # Generate all of the numeric conversion opcodes
 203 for src_t in [tint, tuint, tfloat, tbool]:
 204    if src_t == tbool:
 205       dst_types = [tfloat, tint]
 206    elif src_t == tint:
 207       dst_types = [tfloat, tint, tbool]
 208    elif src_t == tuint:
 209       dst_types = [tfloat, tuint]
 210    elif src_t == tfloat:
 211       dst_types = [tint, tuint, tfloat, tbool]
 212
 213    for dst_t in dst_types:
 214       for bit_size in type_sizes(dst_t):
 215           if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
 216               rnd_modes = ['_rtne', '_rtz', '']
 217               for rnd_mode in rnd_modes:
 218                   unop_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
 219                                                        bit_size, rnd_mode),
 220                                dst_t + str(bit_size), src_t, "src0")
 221           else:
 222               conv_expr = "src0 != 0" if dst_t == tbool else "src0"
 223               unop_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
 224                            dst_t + str(bit_size), src_t, conv_expr)
 225
 226
# Unary floating-point rounding operations.
#
# Each expression picks the double or the float flavour of the C math
# function depending on bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# ffract is the value minus its floor.
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# fquantize2f16 flushes inputs whose magnitude is below 2^-14 to a zero with
# the input's sign, and otherwise round-trips the value through the
# float-to-half and half-to-float helpers.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# The two halves of C frexp() on a 64-bit float: frexp_exp writes the
# exponent into dst through the out-parameter, frexp_sig keeps the returned
# significand and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat64, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat64, tfloat64, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 257
 258
 259 # Floating point pack and unpack operations.
 260
 261 def pack_2x16(fmt):
 262    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 263 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 264 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 265 """.replace("fmt", fmt))
 266
 267 def pack_4x8(fmt):
 268    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 269 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 270 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 271 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 272 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 273 """.replace("fmt", fmt))
 274
 275 def unpack_2x16(fmt):
 276    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 277 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 278 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 279 """.replace("fmt", fmt))
 280
 281 def unpack_4x8(fmt):
 282    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 283 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 284 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 285 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 286 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 287 """.replace("fmt", fmt))
 288
 289
# Instantiate the float pack/unpack opcodes for each supported format.
# Only the 2x16 layout is generated for "half".
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Pack two uints into one: the low 16 bits of x plus y shifted into the
# upper half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# Pack four uints into one, each shifted to its own byte position.  The
# components are OR-ed together without being masked to 8 bits first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Bit-exact packing of narrower integers into a wider one; .x supplies the
# least significant part.  The casts widen each component before it is
# shifted.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Inverse of pack_64_2x32: low word in .x, high word in .y.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
 323
 324 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 325            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 326
# Split a 32-bit value into two 16-bit words, low word in .x.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.
#
# The "split" opcodes below each produce one half of the corresponding
# two-component unpack: _x takes the low half of the source, _y the high
# half.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 343
# Bit operations, part of ARB_gpu_shader5.


# Mirror the 32 bits of the source: bit n moves to bit 31 - n.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Count the set bits of the source; the loop covers bit_size bits.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the most significant set bit, or -1 when the source is zero.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant (32-bit only): scanning from bit 31 down, index of the first
# 1 bit for non-negative sources and of the first 0 bit for negative ones;
# -1 when no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when the source is zero.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 394
 395
# fnoise<i>_<j>: one opcode for every combination of output width i and
# input width j (both 1..4).  The constant expression is always 0.0f.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")


# AMD_gcn_shader extended instructions
#
# Both opcodes pick the axis with the largest magnitude (ties are tested in
# x, y, z order and a later matching branch overwrites an earlier one) and
# the sign of the source along that axis.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
""")

# Face numbering: 0/1 for +x/-x, 2/3 for +y/-y, 4/5 for +z/-z.
# NOTE(review): unlike cube_face_coord, dst.x is not given an initial value
# here, so it stays unassigned when no branch matches (e.g. NaN inputs).
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
 426
 427
 428 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 429    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 430
 431 def binop(name, ty, alg_props, const_expr):
 432    binop_convert(name, ty, ty, alg_props, const_expr)
 433
 434 def binop_compare(name, ty, alg_props, const_expr):
 435    binop_convert(name, tbool1, ty, alg_props, const_expr)
 436
 437 def binop_compare32(name, ty, alg_props, const_expr):
 438    binop_convert(name, tbool32, ty, alg_props, const_expr)
 439
 440 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 441                 src2_type, const_expr):
 442    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 443           "", const_expr)
 444
 445 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 446                  reduce_expr, final_expr):
 447    def final(src):
 448       return final_expr.format(src= "(" + src + ")")
 449    def reduce_(src0, src1):
 450       return reduce_expr.format(src0=src0, src1=src1)
 451    def prereduce(src0, src1):
 452       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 453    src0 = prereduce("src0.x", "src1.x")
 454    src1 = prereduce("src0.y", "src1.y")
 455    src2 = prereduce("src0.z", "src1.z")
 456    src3 = prereduce("src0.w", "src1.w")
 457    opcode(name + "2", output_size, output_type,
 458           [2, 2], [src_type, src_type], commutative,
 459           final(reduce_(src0, src1)))
 460    opcode(name + "3", output_size, output_type,
 461           [3, 3], [src_type, src_type], commutative,
 462           final(reduce_(reduce_(src0, src1), src2)))
 463    opcode(name + "4", output_size, output_type,
 464           [4, 4], [src_type, src_type], commutative,
 465           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 466
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
# Saturating signed add: when the sum wraps in the direction of src1's sign,
# the result is replaced by (1 << (bit_size - 1)) - 1 or 1 << (bit_size - 1)
# respectively, i.e. the largest / smallest value of the signed type.
binop("iadd_sat", tint, commutative + associative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Saturating unsigned add: a sum smaller than src0 means it wrapped.
binop("uadd_sat", tuint, commutative,
      "(src0 + src1) < src0 ? UINT64_MAX : (src0 + src1)")
# Saturating signed subtract, mirroring iadd_sat.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Saturating unsigned subtract: clamps at 0 instead of wrapping.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, commutative,
              "(uint64_t)src0 * (uint64_t)src1")
 495
# high 32-bits of signed integer multiply
# (more generally the upper bit_size bits of the double-width product: sizes
# below 64 multiply in 64-bit arithmetic and shift right by bit_size, while
# the 64-bit case goes through the ubm_mul_u32arr word-array multiply)
binop("imul_high", tint, commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
# (same scheme as imul_high, without the sign-extension words)
binop("umul_high", tuint, commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")
 536
# Integer division by zero constant-folds to 0 instead of trapping.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
# (the result is registered with the uint type, not a bool type)

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.
# (the result is registered with the uint type, not a bool type)

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 563
 564 # rhadd: (a + b + 1) >> 1 (without overflow)
 565 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 566 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 567 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 568 #           =     ((x | y) << 1) - (x ^ y) + 1
 569 #
 570 # Since we know that the bottom bit of (x & y) << 1 is zero,
 571 #
 572 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 573 #                  = (x | y) -  ((x ^ y)      >> 1)
 574 binop("irhadd", tint, commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 575 binop("urhadd", tuint, commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 576
# As with division, a zero divisor constant-folds to 0 for the integer
# modulus/remainder opcodes.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod keeps the C remainder when it is zero or when both operands have the
# same sign, and otherwise adds src1 to it.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient down (floor), frem rounds it toward zero (trunc).
# NOTE(review): both always use the single-precision floorf/truncf, whatever
# the bit size -- confirm for 64-bit operands.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 593
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
#
# Every comparison exists twice: the plain name is registered through
# binop_compare (bool1 result) and the "32"-suffixed name through
# binop_compare32 (bool32 result).  Only the equality tests are commutative.

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
binop_compare32("flt32", tfloat, "", "src0 < src1")
binop_compare32("fge32", tfloat, "", "src0 >= src1")
binop_compare32("feq32", tfloat, commutative, "src0 == src1")
binop_compare32("fne32", tfloat, commutative, "src0 != src1")
binop_compare32("ilt32", tint, "", "src0 < src1")
binop_compare32("ige32", tint, "", "src0 >= src1")
binop_compare32("ieq32", tint, commutative, "src0 == src1")
binop_compare32("ine32", tint, commutative, "src0 != src1")
binop_compare32("ult32", tuint, "", "src0 < src1")
binop_compare32("uge32", tuint, "", "src0 >= src1")
 621
# integer-aware GLSL-style comparisons that compare floats and ints
#
# Each call registers three opcodes (suffixes 2, 3 and 4).  The "all"
# variants AND together the per-channel equality tests, the "any" variants
# OR together the per-channel inequality tests.

binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# The same four reductions with a bool32 result.
binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 648
# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is registered with the unsized float type while its three
# siblings use float32 -- confirm whether that difference is deliberate.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal

# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a uint32 and is masked with (bit width of
# src0) - 1 before use.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products over 2, 3 and 4 components.  The "replicated" variants have
# an output size of 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: dot product of a 3-component src0 with the xyz part of src1, plus
# src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum / maximum.  The float versions carry no algebraic properties,
# while the integer versions are commutative and associative.
# NOTE(review): fmin/fmax always call the single-precision fminf/fmaxf --
# confirm for 64-bit operands.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 704
# The opcodes in this group treat each 32-bit operand as four 8-bit lanes:
# the loops walk the lanes in steps of 8 bits, mask each lane with 0xff and
# OR the per-lane result back into dst at the same position.

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# (a lane stays 0 unless src0's lane is greater than src1's)
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
 749
 750 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 751
# Split pack opcodes: build one wider value from two separate sources, with
# src0 in the low bits and src1 shifted into the high bits.

# Two 32-bit floats -> one 32-bit word.  Each source is converted with
# pack_half_1x16(); src0.x lands in bits 0-15 and src1.x in bits 16-31.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Two 32-bit words -> one 64-bit value.  src1 is cast to uint64_t before the
# shift so the high word is not shifted out of a 32-bit intermediate.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# Two 16-bit words -> one 32-bit value.  src1 is cast to uint32_t before the
# shift for the same reason.
binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")
 760
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction: it builds a mask of `bits` one-bits
# (src0) starting at bit `offset` (src1).  The result is undefined when either
# argument is negative or greater than 31, or when offset + bits exceeds 32;
# the constant expression yields 0 in those cases.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")
 771
# ldexp: scale the float src0 by two to the power of the 32-bit integer src1,
# using ldexp() for 64-bit operands and ldexpf() otherwise.  A result that is
# not a normal number is replaced by a zero carrying the sign of src0.
# NOTE(review): isnormal() is also false for infinities and NaN, so those
# results are replaced by zero as well -- confirm that is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
 778
# Combines the first component of each input to make a 2-component vector.
# Both sources are declared with size 1 and, like the result, typed tuint;
# only the .x component of each source is read.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")
 785
# Byte extraction: shift src0 right by src1 bytes (src1 * 8 bits) and keep
# the low 8 bits.  The C cast decides the extension: uint8_t zero-extends
# (extract_u8), int8_t sign-extends (extract_i8).
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: same as above with 16-bit units (shift by src1 * 16 bits,
# keep the low 16 bits, zero- or sign-extended by the cast).
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 793
 794
 795 def triop(name, ty, const_expr):
 796    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 797 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 798    opcode(name, output_size, tuint,
 799    [src1_size, src2_size, src3_size],
 800    [tuint, tuint, tuint], "", const_expr)
 801
# Multiply-add: src0 * src1 + src2.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with weight src2: a weight of 0
# yields src0 and a weight of 1 yields src1.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
#
# fcsel is the floating-point version: it is declared for 32-bit floats and
# picks src1 when src0 compares unequal to 0.0f, src2 otherwise.  The
# integer-bool versions (bcsel, b32csel) are declared further down.


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
 814
 815 # 3 way min/max/med
 816 triop("fmin3", tfloat, "fminf(src0, fminf(src1, src2))")
 817 triop("imin3", tint, "MIN2(src0, MIN2(src1, src2))")
 818 triop("umin3", tuint, "MIN2(src0, MIN2(src1, src2))")
 819
 820 triop("fmax3", tfloat, "fmaxf(src0, fmaxf(src1, src2))")
 821 triop("imax3", tint, "MAX2(src0, MAX2(src1, src2))")
 822 triop("umax3", tuint, "MAX2(src0, MAX2(src1, src2))")
 823
 824 triop("fmed3", tfloat, "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 825 triop("imed3", tint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 826 triop("umed3", tuint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 827
# Boolean conditional select: the result is src1 where the condition src0 is
# true and src2 otherwise.  The two opcodes differ only in the type of the
# condition source: tbool1 for bcsel, tbool32 for b32csel.
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool1, tuint, tuint], "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], "", "src0 ? src1 : src2")
 832
# SM5 bfi assembly
#
# Inserts bits of src1 (insert) into src2 (base) under the mask src0.
# `insert` is shifted left once for every trailing zero bit of the mask, so
# its bit 0 lines up with the lowest set mask bit; bits covered by the mask
# then come from the shifted insert and all other bits from base.  A zero
# mask returns base unchanged; handling it up front also keeps the alignment
# loop, which stops only on a set bit, from running forever.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")
 847
# SM5 ubfe/ibfe assembly
#
# Extract `bits` (src2) bits of src0 starting at bit `offset` (src1):
#  - bits == 0 yields 0;
#  - a negative bits or offset is undefined and yields 0;
#  - when offset + bits < 32 the value is shifted left to discard everything
#    above the field, then right by 32 - bits to bring the field down to
#    bit 0;
#  - otherwise the field reaches the top of the word and the result is simply
#    base >> offset.
# ubfe works on an unsigned base, so the right shifts fill with zeros.  ibfe
# works on a signed int base and relies on the right shift sign-extending.
# NOTE(review): the final branch does not bound offset, so offset >= 32 would
# shift by the full width or more, which is undefined in C -- confirm that
# case cannot reach constant folding.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
 877
# GLSL bitfieldExtract()
#
# Unsigned variant: returns `bits` (src2) bits of src0 starting at bit
# `offset` (src1), zero-extended.  bits == 0 yields 0; a negative argument or
# offset + bits > 32 is undefined and also yields 0.  The mask is built from
# a 64-bit one (1ull) so that bits == 32 does not shift by the full width of
# a 32-bit value.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
 891 opcode("ibitfield_extract", 0, tint32,
 892        [0, 0, 0], [tint32, tint32, tint32], "", """
 893 int base = src0;
 894 int offset = src1, bits = src2;
 895 if (bits == 0) {
 896    dst = 0;
 897 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 898    dst = 0;
 899 } else {
 900    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 901 }
 902 """)
 903
# Combines the first component of each input to make a 3-component vector.
# Each of the three sources is declared with size 1; only its .x component
# is read.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
 911
 912 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 913                  src4_size, const_expr):
 914    opcode(name, output_size, tuint,
 915           [src1_size, src2_size, src3_size, src4_size],
 916           [tuint, tuint, tuint, tuint],
 917           "", const_expr)
 918
# bitfield_insert: replace `bits` (src3) bits of src0 (base), starting at bit
# `offset` (src2), with the low bits of src1 (insert).  bits == 0 returns
# base unchanged; a negative argument or bits + offset > 32 yields 0.  The
# mask is built from a 64-bit one (1ull) so that bits == 32 does not shift by
# the full width of a 32-bit value.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
 932
# Combines the first component of each input to make a 4-component vector.
# Each of the four sources is declared with size 1; only its .x component is
# read.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
 939
 940