src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16)
  79       for size in input_sizes:
  80          assert 0 <= size <= 4 or (size == 8) or (size == 16)
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
# helper variables for strings
#
# Each name holds a NIR type string used as an opcode input or output type.
# tfloat, tint, tuint and tbool carry no bit size; the remaining names have an
# explicit bit size appended to the base type.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint8 = "uint8"
tint16 = "int16"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
 112
# Splits a NIR type string into its base type (group "type") and an optional
# decimal bit size (group "bits").  The helpers below use it with match(), so
# it is only anchored at the start of the string.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 114
 115 def type_has_size(type_):
 116     m = _TYPE_SPLIT_RE.match(type_)
 117     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 118     return m.group('bits') is not None
 119
 120 def type_size(type_):
 121     m = _TYPE_SPLIT_RE.match(type_)
 122     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 123     assert m.group('bits') is not None, \
 124            'NIR type string has no bit size: "{}"'.format(type_)
 125     return int(m.group('bits'))
 126
 127 def type_sizes(type_):
 128     if type_has_size(type_):
 129         return [type_size(type_)]
 130     elif type_ == 'bool':
 131         return [1, 8, 16, 32]
 132     elif type_ == 'float':
 133         return [16, 32, 64]
 134     else:
 135         return [1, 8, 16, 32, 64]
 136
 137 def type_base_type(type_):
 138     m = _TYPE_SPLIT_RE.match(type_)
 139     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 140     return m.group('type')
 141
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
#
# Each property string ends with a space so that several properties can be
# joined with "+" into one space-separated algebraic_properties string.
_2src_commutative = "2src_commutative "
associative = "associative "

# global dictionary of opcodes, mapping opcode name to its Opcode object;
# filled in by opcode() below
opcodes = {}
 152
 153 def opcode(name, output_size, output_type, input_sizes, input_types,
 154            is_conversion, algebraic_properties, const_expr):
 155    assert name not in opcodes
 156    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 157                           input_types, is_conversion, algebraic_properties,
 158                           const_expr)
 159
 160 def unop_convert(name, out_type, in_type, const_expr):
 161    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 162
 163 def unop(name, ty, const_expr):
 164    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 165
 166 def unop_horiz(name, output_size, output_type, input_size, input_type,
 167                const_expr):
 168    opcode(name, output_size, output_type, [input_size], [input_type],
 169           False, "", const_expr)
 170
 171 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 172                 reduce_expr, final_expr):
 173    def prereduce(src):
 174       return "(" + prereduce_expr.format(src=src) + ")"
 175    def final(src):
 176       return final_expr.format(src="(" + src + ")")
 177    def reduce_(src0, src1):
 178       return reduce_expr.format(src0=src0, src1=src1)
 179    src0 = prereduce("src0.x")
 180    src1 = prereduce("src0.y")
 181    src2 = prereduce("src0.z")
 182    src3 = prereduce("src0.w")
 183    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 184               final(reduce_(src0, src1)))
 185    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 186               final(reduce_(reduce_(src0, src1), src2)))
 187    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 188               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 189
 190 def unop_numeric_convert(name, out_type, in_type, const_expr):
 191    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 192
# Basic unary opcodes.  Each call registers a one-source opcode whose source
# and destination share the given type; the string is its C constant
# expression.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fsign and isign yield 0 for a zero source, otherwise +1 or -1 by sign.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# Clamps: fsat to [0, 1], fsat_signed to [-1, 1], fclamp_pos to >= 0.
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("fsat_signed", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos", tfloat, ("fmax(src0, 0.0)"))
# These pick the double or the float C routine depending on bit_size.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): these use the float routines for every bit size - confirm
# that is intended for 64-bit sources.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 212
# Generate all of the numeric conversion opcodes
#
# Opcode names are built from the first letter of the source type, "2", the
# first letter of the destination type and the destination bit size, with an
# optional rounding-mode suffix for float -> float16 (for example f2f16_rtz).
# The source type is left unsized; the destination type is always sized.
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination base types that are generated for each source base type.
   if src_t == tbool:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 gets one opcode per explicit rounding mode
              # plus an unsuffixed variant whose expression is a plain copy.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = "src0"

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32 only needs special handling when narrowing
              # from a wider source while round-toward-zero is requested.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          else:
              # Conversions to bool test against zero; all others are a copy.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
 270
# Special opcode that is the same as f2f16, i2i16, u2u16 except that it is safe
# to remove it if the result is immediately converted back to 32 bits again.
# This is generated as part of the precision lowering pass. mp stands for medium
# precision.
#
# Each one reuses the constant expression of the opcode it mirrors, so the
# 16-bit conversions must already be registered at this point.
unop_numeric_convert("f2fmp", tfloat16, tfloat, opcodes["f2f16"].const_expr)
unop_numeric_convert("i2imp", tint16, tint, opcodes["i2i16"].const_expr)
unop_numeric_convert("u2ump", tuint16, tuint, opcodes["u2u16"].const_expr)

# Unary floating-point rounding operations.
#
# Each expression selects the double routine for 64-bit sources and the float
# routine otherwise.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Magnitudes below 2^-14 become a zero carrying the sign of src0; everything
# else is converted to half precision and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp stores the exponent produced by frexp() into dst and discards the
# returned significand; frexp_sig keeps the significand and discards the
# exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 307 unop("fddx_coarse", tfloat, "0.0")
 308 unop("fddy_coarse", tfloat, "0.0")
 309
 310
 311 # Floating point pack and unpack operations.
 312
 313 def pack_2x16(fmt):
 314    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 315 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 316 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 317 """.replace("fmt", fmt))
 318
 319 def pack_4x8(fmt):
 320    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 321 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 322 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 323 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 324 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 325 """.replace("fmt", fmt))
 326
 327 def unpack_2x16(fmt):
 328    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 329 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 330 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 331 """.replace("fmt", fmt))
 332
 333 def unpack_4x8(fmt):
 334    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 335 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 336 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 337 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 338 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 339 """.replace("fmt", fmt))
 340
 341
# Instantiate the pack/unpack opcodes for each format.  "half" only gets the
# 2x16 variants.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Low 16 bits of src0.x combined with src0.y shifted into the high 16 bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# ORs the four components together at 8-bit offsets; the components are not
# masked first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Integer packs: combine narrower unsigned components into one wider value,
# with component x in the least significant position.  Components are widened
# before shifting so no bits are lost.
unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
 375
 376 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
 377            "dst.x = src0.x; dst.y = src0.x >> 32;")
 378
 379 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 380            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 381
 382 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
 383            "dst.x = src0.x; dst.y = src0.x >> 16;")
 384
 385 unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
 386            "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")
 387
 388 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
 389 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
 390 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
 391 """)
 392
# Lowered floating point unpacking operations.
#
# The _x variants read the low 16 bits of the source and the _y variants the
# high 16 bits.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer splits: _x is the source itself and _y the source shifted down by
# half its width, both with the narrower destination type.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 410
# Bit operations, part of ARB_gpu_shader5.


# Moves bit i of the source to bit (31 - i) of the result.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the most significant set bit, or -1 when no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# 31 minus the index of the most significant set bit.  When no bit is set the
# loop finishes with bit == -1, so the result is 32.
unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(31 - bit);
""")

# Signed variant: scans from bit 31 downwards for the first bit that differs
# from the sign (see the comment inside the expression); -1 if none is found.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 470
# AMD_gcn_shader extended instructions
#
# cube_face_coord: ma is twice the component with the largest absolute value.
# The two result components are chosen and signed according to which axis is
# largest and its sign, then scaled by 1 / ma and offset by 0.5.  When several
# axes tie, every matching "if" runs in order and the last one wins.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x * (1.0f / ma) + 0.5f;
dst.y = dst.y * (1.0f / ma) + 0.5f;
""")

# cube_face_index: 0/1 for +x/-x, 2/3 for +y/-y, 4/5 for +z/-z, selected by
# the component with the largest absolute value and its sign.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
# Registers fsum2, fsum3 and fsum4 through unop_reduce.
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 508
 509 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 510    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 511           False, alg_props, const_expr)
 512
 513 def binop(name, ty, alg_props, const_expr):
 514    binop_convert(name, ty, ty, alg_props, const_expr)
 515
 516 def binop_compare(name, ty, alg_props, const_expr):
 517    binop_convert(name, tbool1, ty, alg_props, const_expr)
 518
 519 def binop_compare8(name, ty, alg_props, const_expr):
 520    binop_convert(name, tbool8, ty, alg_props, const_expr)
 521
 522 def binop_compare16(name, ty, alg_props, const_expr):
 523    binop_convert(name, tbool16, ty, alg_props, const_expr)
 524
 525 def binop_compare32(name, ty, alg_props, const_expr):
 526    binop_convert(name, tbool32, ty, alg_props, const_expr)
 527
 528 def binop_compare_all_sizes(name, ty, alg_props, const_expr):
 529    binop_compare(name, ty, alg_props, const_expr)
 530    binop_compare8(name + "8", ty, alg_props, const_expr)
 531    binop_compare16(name + "16", ty, alg_props, const_expr)
 532    binop_compare32(name + "32", ty, alg_props, const_expr)
 533
 534 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 535                 src2_type, const_expr):
 536    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 537           False, "", const_expr)
 538
 539 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 540                  reduce_expr, final_expr):
 541    def final(src):
 542       return final_expr.format(src= "(" + src + ")")
 543    def reduce_(src0, src1):
 544       return reduce_expr.format(src0=src0, src1=src1)
 545    def prereduce(src0, src1):
 546       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 547    srcs = [prereduce("src0." + letter, "src1." + letter) for letter in "xyzwefghijklmnop"]
 548    def pairwise_reduce(start, size):
 549       if (size == 1):
 550          return srcs[start]
 551       return reduce_(pairwise_reduce(start, size // 2), pairwise_reduce(start + size // 2, size // 2))
 552    for size in [2, 4, 8, 16]:
 553       opcode(name + str(size), output_size, output_type,
 554              [size, size], [src_type, src_type], False, _2src_commutative,
 555              final(pairwise_reduce(0, size)))
 556    opcode(name + "3", output_size, output_type,
 557           [3, 3], [src_type, src_type], False, _2src_commutative,
 558           final(reduce_(reduce_(srcs[0], srcs[1]), srcs[2])))
 559
 560 def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
 561                            reduce_expr, final_expr):
 562    binop_reduce(name, output_size, tbool1, src_type,
 563                 prereduce_expr, reduce_expr, final_expr)
 564    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
 565                 prereduce_expr, reduce_expr, final_expr)
 566    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
 567                 prereduce_expr, reduce_expr, final_expr)
 568    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
 569                 prereduce_expr, reduce_expr, final_expr)
 570
# Addition and subtraction.
#
# fadd and fsub honour a round-toward-zero execution mode: 64-bit operands go
# through the dedicated _rtz helper, narrower ones are computed in double and
# then rounded to float toward zero.  Otherwise plain C arithmetic is used.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add: yields (1 << (bit_size - 1)) - 1 when adding a
# positive src1 makes the sum smaller than src0, and 1 << (bit_size - 1) when
# adding a non-positive src1 makes it larger.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Saturating unsigned add: a wrapped sum (smaller than src0) is replaced by
# the maximum unsigned value for the operand size.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract, mirroring iadd_sat with the sign test on src1
# reversed.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Saturating unsigned subtract: clamps at 0 instead of wrapping.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
# Absolute difference of two signed sources, returned as an unsigned value;
# the larger operand is always the minuend.
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
# Absolute difference of two unsigned sources.
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
# Multiplication.
#
# fmul handles round-toward-zero the same way as fadd and fsub: a dedicated
# _rtz helper for 64-bit operands, double arithmetic rounded to float toward
# zero for narrower ones.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low bits of signed/unsigned integer multiply (the product truncated to the
# operand type)
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high half of signed integer multiply: the upper bit_size bits of the
# double-width product.  Sizes below 64 use a 64-bit product shifted right by
# bit_size; 64-bit operands are split into 32-bit words and multiplied with
# ubm_mul_u32arr.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high half of unsigned integer multiply: the upper bit_size bits of the
# double-width product, computed like imul_high but without sign extension.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# unsigned multiply of the low halves of the two sources: each source is
# masked to its low bit_size / 2 bits before multiplying
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")

# Multiply 32-bits with low 16-bits.
# The second source is truncated to 16 bits (signed or unsigned) first.
binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")
 682
# Division.  The integer variants fold a division by zero to 0 instead of
# evaluating it.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.
# The result type is unsigned; the C comparison yields 1 when the sum wrapped
# and 0 otherwise.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 709
 710 # rhadd: (a + b + 1) >> 1 (without overflow)
 711 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 712 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 713 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 714 #           =     ((x | y) << 1) - (x ^ y) + 1
 715 #
 716 # Since we know that the bottom bit of (x & y) << 1 is zero,
 717 #
 718 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 719 #                  = (x | y) -  ((x ^ y)      >> 1)
 720 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
# Unsigned remainder; a zero divisor folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod keeps the C remainder when it is zero or when both operands have the
# same sign; otherwise it adds src1.  A zero divisor folds to 0.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# Floating-point counterparts: fmod rounds the quotient down (floorf), frem
# rounds it toward zero (truncf).
# NOTE(review): both use the float routines for every bit size - confirm that
# is intended for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 737 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
 738 binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 739
 740 #
 741 # Comparisons
 742 #
 743
 744
 745 # these integer-aware comparisons return a boolean (0 or ~0)
 746
 747 binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
 748 binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
 749 binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
 750 binop_compare_all_sizes("fne", tfloat, _2src_commutative, "src0 != src1")
 751 binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
 752 binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
 753 binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
 754 binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
 755 binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
 756 binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
 757
 758 # integer-aware GLSL-style comparisons that compare floats and ints
 759
 760 binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
 761                        "{src0} && {src1}", "{src}")
 762 binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
 763                        "{src0} || {src1}", "{src}")
 764 binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
 765                        "{src0} && {src1}", "{src}")
 766 binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
 767                        "{src0} || {src1}", "{src}")
 768
 769 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 770
 771 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
 772              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
 773 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
 774              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 775
 776 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 777 # and false respectively
 778
 779 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
 780 binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
 781 binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
 782 binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 783
 784 # SPIRV shifts are undefined for shift-operands >= bitsize,
 785 # but SM5 shifts are defined to use the least significant bits, only
 786 # The NIR definition is according to the SM5 specification.
 787 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
 788        "src0 << (src1 & (sizeof(src0) * 8 - 1))")
 789 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
 790        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 791 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
 792        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
 793
 794 opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 795    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 796    dst = (src0 << (src1 & rotate_mask)) |
 797          (src0 >> (-src1 & rotate_mask));
 798 """)
 799 opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
 800    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
 801    dst = (src0 >> (src1 & rotate_mask)) |
 802          (src0 << (-src1 & rotate_mask));
 803 """)
 804
 805 # bitwise logic operators
 806 #
 807 # These are also used as boolean and, or, xor for hardware supporting
 808 # integers.
 809
 810
 811 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
 812 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
 813 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
 814
 815
 816 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
 817              "{src}")
 818
 819 binop_reduce("fdot_replicated", 4, tfloat, tfloat,
 820              "{src0} * {src1}", "{src0} + {src1}", "{src}")
 821
 822 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
 823        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 824 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
 825        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
 826
 827 binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
 828 binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 829 binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
 830 binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
 831 binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 832 binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 833
 834 # Saturated vector add for 4 8bit ints.
 835 binop("usadd_4x8", tint32, _2src_commutative + associative, """
 836 dst = 0;
 837 for (int i = 0; i < 32; i += 8) {
 838    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 839 }
 840 """)
 841
 842 # Saturated vector subtract for 4 8bit ints.
 843 binop("ussub_4x8", tint32, "", """
 844 dst = 0;
 845 for (int i = 0; i < 32; i += 8) {
 846    int src0_chan = (src0 >> i) & 0xff;
 847    int src1_chan = (src1 >> i) & 0xff;
 848    if (src0_chan > src1_chan)
 849       dst |= (src0_chan - src1_chan) << i;
 850 }
 851 """)
 852
 853 # vector min for 4 8bit ints.
 854 binop("umin_4x8", tint32, _2src_commutative + associative, """
 855 dst = 0;
 856 for (int i = 0; i < 32; i += 8) {
 857    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 858 }
 859 """)
 860
 861 # vector max for 4 8bit ints.
 862 binop("umax_4x8", tint32, _2src_commutative + associative, """
 863 dst = 0;
 864 for (int i = 0; i < 32; i += 8) {
 865    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 866 }
 867 """)
 868
 869 # unorm multiply: (a * b) / 255.
 870 binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
 871 dst = 0;
 872 for (int i = 0; i < 32; i += 8) {
 873    int src0_chan = (src0 >> i) & 0xff;
 874    int src1_chan = (src1 >> i) & 0xff;
 875    dst |= ((src0_chan * src1_chan) / 255) << i;
 876 }
 877 """)
 878
 879 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 880
 881 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 882             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 883
 884 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 885               "src0 | ((uint64_t)src1 << 32)")
 886
 887 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
 888               "src0 | ((uint32_t)src1 << 16)")
 889
 890 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 891 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
 892 # are from the low five bits of src0 and src1, respectively.
 893 binop_convert("bfm", tuint32, tint32, "", """
 894 int bits = src0 & 0x1F;
 895 int offset = src1 & 0x1F;
 896 dst = ((1u << bits) - 1) << offset;
 897 """)
 898
 899 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
 900 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 901 /* flush denormals to zero. */
 902 if (!isnormal(dst))
 903    dst = copysignf(0.0f, src0);
 904 """)
 905
 906 # Combines the first component of each input to make a 2-component vector.
 907
 908 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 909 dst.x = src0.x;
 910 dst.y = src1.x;
 911 """)
 912
 913 # Byte extraction
 914 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
 915 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
 916
 917 # Word extraction
 918 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
 919 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
 920
 921
 922 def triop(name, ty, alg_props, const_expr):
 923    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 924 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 925    opcode(name, output_size, tuint,
 926    [src1_size, src2_size, src3_size],
 927    [tuint, tuint, tuint], False, "", const_expr)
 928
 929 triop("ffma", tfloat, _2src_commutative, """
 930 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 931    if (bit_size == 64)
 932       dst = _mesa_double_fma_rtz(src0, src1, src2);
 933    else if (bit_size == 32)
 934       dst = _mesa_float_fma_rtz(src0, src1, src2);
 935    else
 936       dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
 937 } else {
 938    if (bit_size == 32)
 939       dst = fmaf(src0, src1, src2);
 940    else
 941       dst = fma(src0, src1, src2);
 942 }
 943 """)
 944
 945 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 946
 947 # Conditional Select
 948 #
 949 # A vector conditional select instruction (like ?:, but operating per-
 950 # component on vectors). There are two versions, one for floating point
 951 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 952
 953
 954 triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
 955
 956 # 3 way min/max/med
 957 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 958 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 959 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 960
 961 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 962 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 963 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 964
 965 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 966 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 967 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 968
 969 opcode("bcsel", 0, tuint, [0, 0, 0],
 970        [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
 971 opcode("b8csel", 0, tuint, [0, 0, 0],
 972        [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
 973 opcode("b16csel", 0, tuint, [0, 0, 0],
 974        [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
 975 opcode("b32csel", 0, tuint, [0, 0, 0],
 976        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
 977
 978 # SM5 bfi assembly
 979 triop("bfi", tuint32, "", """
 980 unsigned mask = src0, insert = src1, base = src2;
 981 if (mask == 0) {
 982    dst = base;
 983 } else {
 984    unsigned tmp = mask;
 985    while (!(tmp & 1)) {
 986       tmp >>= 1;
 987       insert <<= 1;
 988    }
 989    dst = (base & ~mask) | (insert & mask);
 990 }
 991 """)
 992
 993
 994 triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
 995
 996 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
 997 opcode("ubfe", 0, tuint32,
 998        [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
 999 unsigned base = src0;
1000 unsigned offset = src1 & 0x1F;
1001 unsigned bits = src2 & 0x1F;
1002 if (bits == 0) {
1003    dst = 0;
1004 } else if (offset + bits < 32) {
1005    dst = (base << (32 - bits - offset)) >> (32 - bits);
1006 } else {
1007    dst = base >> offset;
1008 }
1009 """)
1010 opcode("ibfe", 0, tint32,
1011        [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
1012 int base = src0;
1013 unsigned offset = src1 & 0x1F;
1014 unsigned bits = src2 & 0x1F;
1015 if (bits == 0) {
1016    dst = 0;
1017 } else if (offset + bits < 32) {
1018    dst = (base << (32 - bits - offset)) >> (32 - bits);
1019 } else {
1020    dst = base >> offset;
1021 }
1022 """)
1023
# GLSL bitfieldExtract()
#
# Unsigned variant: src0 is the base, src1 the offset and src2 the number of
# bits.  Zero bits yields 0, and so do negative or out-of-range arguments.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32],
       False, "",
       """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed bitfield extract: src0 is the base, src1 the offset and src2 the
# number of bits.  Zero bits yields 0, and so do negative or out-of-range
# arguments.
#
# The field is first shifted left so that its top bit lands in bit 31, then
# shifted right by (32 - bits) so that it ends up at bit 0 with its sign
# extended.  Shifting right by offset instead would leave the field
# misplaced; e.g. base = 0xF0, offset = 4, bits = 4 must give -1.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1049
# vec3: builds a 3-component vector from the first component of each source.

triop_horiz(
   "vec3", 3, 1, 1, 1,
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1057
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit vector sizes.

   The output and every source are typed tuint; the opcode has no algebraic
   properties and is not flagged as a conversion.
   """
   input_sizes = [src1_size, src2_size, src3_size, src4_size]
   input_types = [tuint] * len(input_sizes)
   opcode(name, output_size, tuint, input_sizes, input_types,
          False, "", const_expr)
1064
# Bitfield insert: src0 is the base, src1 the value to insert, src2 the
# offset and src3 the number of bits.  Zero bits returns the base unchanged;
# negative or out-of-range offset/bits yield 0.
opcode("bitfield_insert", 0, tuint32,
       [0, 0, 0, 0], [tuint32, tuint32, tint32, tint32],
       False, "",
       """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1078
# vec4: builds a 4-component vector from the first component of each source.
quadop_horiz(
   "vec4", 4, 1, 1, 1, 1,
   """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
1085
# vec8: builds an 8-component vector from the first component of each of the
# eight single-component sources.
opcode("vec8", 8, tuint, [1] * 8, [tuint] * 8, False, "",
       """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

# vec16: same scheme with sixteen single-component sources.
opcode("vec16", 16, tuint, [1] * 16, [tuint] * 16, False, "",
       """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")
1119
# amul: an integer multiply intended for address calculation.  It behaves
# like imul, except that the result is undefined on overflow, where overflow
# is judged against the size of the variable being dereferenced.
#
# Compared to imul, this relaxed definition lets an optimization pass
# propagate bounds (i.e., from a load/store intrinsic) to the sources, so
# that lower precision integer multiplies can be used.  That is useful on
# hardware that has 24b or perhaps 16b integer multiply instructions.
binop("amul", tint, _2src_commutative + associative,
      "src0 * src1")
1131
# ir3-specific instruction mapping directly to mul-add shift high mix
# (IMADSH_MIX16, i.e. ah * bl << 16 + c).  It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32],
       False, "",
       """
dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
""")
1139
# 24-bit multiplies.  Each source is reduced to its low 24 bits by shifting
# left 8 and back right 8; the signed forms cast to int32_t so the right
# shift sign-extends, the unsigned forms cast to uint32_t.

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
triop(
   "imad24_ir3", tint32, _2src_commutative,
   "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension)
binop(
   "imul24", tint32, _2src_commutative + associative,
   "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")

# unsigned 24b multiply into 32b result plus 32b int
triop(
   "umad24", tuint32, _2src_commutative,
   "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")

# unsigned 24b multiply into 32b result uint
#
# NOTE(review): declared with tint32 although the expression is unsigned and
# umad24 uses tuint32 -- confirm whether that is intentional.
binop(
   "umul24", tint32, _2src_commutative + associative,
   "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")