src/compiler/nir/nir_opcodes.py

   1 #
   2 # Copyright (C) 2014 Connor Abbott
   3 #
   4 # Permission is hereby granted, free of charge, to any person obtaining a
   5 # copy of this software and associated documentation files (the "Software"),
   6 # to deal in the Software without restriction, including without limitation
   7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 # and/or sell copies of the Software, and to permit persons to whom the
   9 # Software is furnished to do so, subject to the following conditions:
  10 #
  11 # The above copyright notice and this permission notice (including the next
  12 # paragraph) shall be included in all copies or substantial portions of the
  13 # Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 # IN THE SOFTWARE.
  22 #
  23 # Authors:
  24 #    Connor Abbott (cwabbott0@gmail.com)
  25
  26 import re
  27
  28 # Class that represents all the information we have about the opcode
  29 # NOTE: this must be kept in sync with nir_op_info
  30
  31 class Opcode(object):
  32    """Class that represents all the information we have about the opcode
  33    NOTE: this must be kept in sync with nir_op_info
  34    """
  35    def __init__(self, name, output_size, output_type, input_sizes,
  36                 input_types, is_conversion, algebraic_properties, const_expr):
  37       """Parameters:
  38
  39       - name is the name of the opcode (prepend nir_op_ for the enum name)
  40       - all types are strings that get nir_type_ prepended to them
  41       - input_types is a list of types
  42       - is_conversion is true if this opcode represents a type conversion
  43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
  44         prepended before each entry
  45       - const_expr is an expression or series of statements that computes the
  46         constant value of the opcode given the constant values of its inputs.
  47
  48       Constant expressions are formed from the variables src0, src1, ...,
  49       src(N-1), where N is the number of arguments.  The output of the
  50       expression should be stored in the dst variable.  Per-component input
  51       and output variables will be scalars and non-per-component input and
  52       output variables will be a struct with fields named x, y, z, and w
  53       all of the correct type.  Input and output variables can be assumed
  54       to already be of the correct type and need no conversion.  In
  55       particular, the conversion from the C bool type to/from  NIR_TRUE and
  56       NIR_FALSE happens automatically.
  57
  58       For per-component instructions, the entire expression will be
  59       executed once for each component.  For non-per-component
  60       instructions, the expression is expected to store the correct values
  61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
  62       constant expression, an assignment to dst will happen automatically
  63       and the result will be equivalent to "dst = <expression>" for
  64       per-component instructions and "dst.x = dst.y = ... = <expression>"
  65       for non-per-component instructions.
  66       """
  67       assert isinstance(name, str)
  68       assert isinstance(output_size, int)
  69       assert isinstance(output_type, str)
  70       assert isinstance(input_sizes, list)
  71       assert isinstance(input_sizes[0], int)
  72       assert isinstance(input_types, list)
  73       assert isinstance(input_types[0], str)
  74       assert isinstance(is_conversion, bool)
  75       assert isinstance(algebraic_properties, str)
  76       assert isinstance(const_expr, str)
  77       assert len(input_sizes) == len(input_types)
  78       assert 0 <= output_size <= 4
  79       for size in input_sizes:
  80          assert 0 <= size <= 4
  81          if output_size != 0:
  82             assert size != 0
  83       self.name = name
  84       self.num_inputs = len(input_sizes)
  85       self.output_size = output_size
  86       self.output_type = output_type
  87       self.input_sizes = input_sizes
  88       self.input_types = input_types
  89       self.is_conversion = is_conversion
  90       self.algebraic_properties = algebraic_properties
  91       self.const_expr = const_expr
  92
  93 # helper variables for strings
  94 tfloat = "float"
  95 tint = "int"
  96 tbool = "bool"
  97 tbool1 = "bool1"
  98 tbool32 = "bool32"
  99 tuint = "uint"
 100 tuint16 = "uint16"
 101 tfloat32 = "float32"
 102 tint32 = "int32"
 103 tuint32 = "uint32"
 104 tint64 = "int64"
 105 tuint64 = "uint64"
 106 tfloat64 = "float64"
 107
 108 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
 109
 110 def type_has_size(type_):
 111     m = _TYPE_SPLIT_RE.match(type_)
 112     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 113     return m.group('bits') is not None
 114
 115 def type_size(type_):
 116     m = _TYPE_SPLIT_RE.match(type_)
 117     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 118     assert m.group('bits') is not None, \
 119            'NIR type string has no bit size: "{}"'.format(type_)
 120     return int(m.group('bits'))
 121
 122 def type_sizes(type_):
 123     if type_has_size(type_):
 124         return [type_size(type_)]
 125     elif type_ == 'bool':
 126         return [1, 32]
 127     elif type_ == 'float':
 128         return [16, 32, 64]
 129     else:
 130         return [1, 8, 16, 32, 64]
 131
 132 def type_base_type(type_):
 133     m = _TYPE_SPLIT_RE.match(type_)
 134     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
 135     return m.group('type')
 136
 137 # Operation where the first two sources are commutative.
 138 #
# For 2-source operations, this is just mathematical commutativity.  Some
 140 # 3-source operations, like ffma, are only commutative in the first two
 141 # sources.
 142 _2src_commutative = "2src_commutative "
 143 associative = "associative "
 144
 145 # global dictionary of opcodes
 146 opcodes = {}
 147
 148 def opcode(name, output_size, output_type, input_sizes, input_types,
 149            is_conversion, algebraic_properties, const_expr):
 150    assert name not in opcodes
 151    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
 152                           input_types, is_conversion, algebraic_properties,
 153                           const_expr)
 154
 155 def unop_convert(name, out_type, in_type, const_expr):
 156    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
 157
 158 def unop(name, ty, const_expr):
 159    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
 160
 161 def unop_horiz(name, output_size, output_type, input_size, input_type,
 162                const_expr):
 163    opcode(name, output_size, output_type, [input_size], [input_type],
 164           False, "", const_expr)
 165
 166 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
 167                 reduce_expr, final_expr):
 168    def prereduce(src):
 169       return "(" + prereduce_expr.format(src=src) + ")"
 170    def final(src):
 171       return final_expr.format(src="(" + src + ")")
 172    def reduce_(src0, src1):
 173       return reduce_expr.format(src0=src0, src1=src1)
 174    src0 = prereduce("src0.x")
 175    src1 = prereduce("src0.y")
 176    src2 = prereduce("src0.z")
 177    src3 = prereduce("src0.w")
 178    unop_horiz(name + "2", output_size, output_type, 2, input_type,
 179               final(reduce_(src0, src1)))
 180    unop_horiz(name + "3", output_size, output_type, 3, input_type,
 181               final(reduce_(reduce_(src0, src1), src2)))
 182    unop_horiz(name + "4", output_size, output_type, 4, input_type,
 183               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 184
 185 def unop_numeric_convert(name, out_type, in_type, const_expr):
 186    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
 187
# mov passes its source through unchanged; it is declared on the unsized
# uint type.
unop("mov", tuint, "src0")

# Negation and bitwise complement.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# Sign, absolute value and saturate.  fsign and fsat select double or float
# literals depending on bit_size.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# Base-2 exponential and logarithm.
# NOTE(review): unlike fsqrt above, these call the single-precision C
# functions for every bit size -- confirm this is intended for 64-bit.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
 207
# Generate all of the numeric conversion opcodes
#
# Opcode names are built from the first letter of the source type, "2", the
# first letter of the destination type and the destination bit size, e.g.
# "i2f32".  The destination type is sized; the source type is left unsized.
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination base types that each source base type can convert to.
   if src_t == tbool:
      dst_types = [tfloat, tint]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for bit_size in type_sizes(dst_t):
          if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 gets three opcodes: one per explicit
              # rounding-mode suffix plus one without a suffix.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
                                                              bit_size, rnd_mode),
                                       dst_t + str(bit_size), src_t, "src0")
          elif bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32: a source wider than 32 bits goes through
              # _mesa_double_to_float_rtz when the execution mode asks for
              # round-toward-zero at 32 bits.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
                                   dst_t + str(bit_size), src_t, conv_expr)
          else:
              # Conversions to bool test against zero; everything else is a
              # plain assignment from src0.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
                                   dst_t + str(bit_size), src_t, conv_expr)
 241
 242
# Unary floating-point rounding operations.
#
# Each expression picks the double or float C function based on bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values whose magnitude is below 2^-14 become a zero carrying the sign of
# the source; everything else is converted to half and back to float.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp has frexp() write the exponent straight into dst; frexp_sig
# keeps frexp()'s return value and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.
#
# All six variants share the same constant expression.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
 273
 274
 275 # Floating point pack and unpack operations.
 276
 277 def pack_2x16(fmt):
 278    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 279 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 280 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 281 """.replace("fmt", fmt))
 282
 283 def pack_4x8(fmt):
 284    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 285 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 286 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 287 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
 288 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 289 """.replace("fmt", fmt))
 290
 291 def unpack_2x16(fmt):
 292    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 293 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 294 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 295 """.replace("fmt", fmt))
 296
 297 def unpack_4x8(fmt):
 298    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 299 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 300 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 301 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
 302 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
 303 """.replace("fmt", fmt))
 304
 305
# Instantiate the pack/unpack opcodes.  snorm and unorm exist in both 2x16
# and 4x8 forms; half only exists as 2x16.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
 316
# Pack two uint32 components into one uint32: the low 16 bits of x, with y
# shifted into the upper half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# Pack four uint32 components into one uint32 at 8-bit offsets.  The
# components are not masked before being shifted.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Pure bit packing of narrower unsigned components into a wider value; x is
# the least significant part.  The casts widen each component before it is
# shifted.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Split a 64-bit value into two 32-bit components, least significant first.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
 339
 340 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
 341            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
 342
# Split a 32-bit value into two 16-bit components, least significant first.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.
#
# The _split_x opcodes produce the low half of the source and the _split_y
# opcodes the high half, each as a separate one-source opcode.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 359
# Bit operations, part of ARB_gpu_shader5.


# Reverse the order of the 32 bits of the source.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Count the set bits of the source; the result is always 32-bit.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the most significant set bit, or -1 if no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant: for negative sources it reports the most significant 0
# bit instead.  The result is -1 when no such bit exists (0 and -1).
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 if no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
 410
 411
# Register fnoise<i>_<j> for every output size i and input size j from 1 to
# 4.  The constant expression is 0.0f for all of them.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 415
 416
# AMD_gcn_shader extended instructions
#
# cube_face_coord: the component with the largest magnitude selects the
# pair of source components copied into dst.x/dst.y and sets ma to twice
# that component; the pair is then divided by ma and offset by 0.5.  When
# magnitudes tie, several of the if statements fire and the last one wins.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# cube_face_index: 0/1 when x has the largest magnitude (non-negative /
# negative), 2/3 for y and 4/5 for z.  The same tie behaviour applies.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
 451
# Sum of vector components
#
# Registers fsum2, fsum3 and fsum4; the components are used as-is and
# combined with "+".
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
 454
 455 def binop_convert(name, out_type, in_type, alg_props, const_expr):
 456    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
 457           False, alg_props, const_expr)
 458
 459 def binop(name, ty, alg_props, const_expr):
 460    binop_convert(name, ty, ty, alg_props, const_expr)
 461
 462 def binop_compare(name, ty, alg_props, const_expr):
 463    binop_convert(name, tbool1, ty, alg_props, const_expr)
 464
 465 def binop_compare32(name, ty, alg_props, const_expr):
 466    binop_convert(name, tbool32, ty, alg_props, const_expr)
 467
 468 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
 469                 src2_type, const_expr):
 470    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
 471           False, "", const_expr)
 472
 473 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
 474                  reduce_expr, final_expr):
 475    def final(src):
 476       return final_expr.format(src= "(" + src + ")")
 477    def reduce_(src0, src1):
 478       return reduce_expr.format(src0=src0, src1=src1)
 479    def prereduce(src0, src1):
 480       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
 481    src0 = prereduce("src0.x", "src1.x")
 482    src1 = prereduce("src0.y", "src1.y")
 483    src2 = prereduce("src0.z", "src1.z")
 484    src3 = prereduce("src0.w", "src1.w")
 485    opcode(name + "2", output_size, output_type,
 486           [2, 2], [src_type, src_type], False, _2src_commutative,
 487           final(reduce_(src0, src1)))
 488    opcode(name + "3", output_size, output_type,
 489           [3, 3], [src_type, src_type], False, _2src_commutative,
 490           final(reduce_(reduce_(src0, src1), src2)))
 491    opcode(name + "4", output_size, output_type,
 492           [4, 4], [src_type, src_type], False, _2src_commutative,
 493           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 494
# Addition.
binop("fadd", tfloat, _2src_commutative + associative, "src0 + src1")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add: overflow is detected by comparing the sum against
# src0 and replaced with the largest or smallest value for bit_size.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Saturating unsigned add: a sum smaller than src0 means it wrapped.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract, using the same scheme as iadd_sat.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Saturating unsigned subtract: the result is 0 when src1 is larger.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Subtraction.
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")
 513
binop("fmul", tfloat, _2src_commutative + associative, "src0 * src1")
# low bits of signed/unsigned integer multiply (the product is truncated to
# the operand type)
binop("imul", tint, _2src_commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high bits of signed integer multiply: the upper bit_size bits of the full
# product.  64-bit operands go through ubm_mul_u32arr; smaller sizes use a
# 64-bit intermediate product.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high bits of unsigned integer multiply: the upper bit_size bits of the
# full product.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# unsigned multiply of the low halves of the sources: each source is masked
# to its low bit_size / 2 bits before the multiply
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")
 570
 571
# Division.  The integer variants produce 0 for a zero divisor instead of
# dividing.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.  The result is the value of the C comparison,
# stored in the uint destination.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.  As with uadd_carry, the destination is a
# uint.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
 585
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
#
# The signed and unsigned variants share the expression; they differ only
# in the operand type it is evaluated with.
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
 598
 599 # rhadd: (a + b + 1) >> 1 (without overflow)
 600 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
 601 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
 602 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
 603 #           =     ((x | y) << 1) - (x ^ y) + 1
 604 #
 605 # Since we know that the bottom bit of (x & y) << 1 is zero,
 606 #
 607 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
 608 #                  = (x | y) -  ((x ^ y)      >> 1)
 609 binop("irhadd", tint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 610 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) + ((src0 ^ src1) >> 1)")
 611
# Unsigned modulus; a zero divisor yields 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 to the C remainder when that remainder is non-zero and the
# operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient down; frem truncates it toward zero.
# NOTE(review): both use the single-precision floorf/truncf for every bit
# size -- confirm this is intended for 64-bit.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
 628
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
#
# Each comparison is registered twice: once producing bool1 (plain name)
# and once producing bool32 (name with a "32" suffix).  Only the equality
# and inequality tests are marked commutative.

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare("fne", tfloat, _2src_commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare("ine", tint, _2src_commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
binop_compare32("flt32", tfloat, "", "src0 < src1")
binop_compare32("fge32", tfloat, "", "src0 >= src1")
binop_compare32("feq32", tfloat, _2src_commutative, "src0 == src1")
binop_compare32("fne32", tfloat, _2src_commutative, "src0 != src1")
binop_compare32("ilt32", tint, "", "src0 < src1")
binop_compare32("ige32", tint, "", "src0 >= src1")
binop_compare32("ieq32", tint, _2src_commutative, "src0 == src1")
binop_compare32("ine32", tint, _2src_commutative, "src0 != src1")
binop_compare32("ult32", tuint, "", "src0 < src1")
binop_compare32("uge32", tuint, "", "src0 >= src1")
 656
# integer-aware GLSL-style comparisons that compare floats and ints
#
# Each call registers 2-, 3- and 4-component variants.  The "all" forms AND
# the per-component equality tests together; the "any" forms OR the
# per-component inequality tests together.

binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# Same reductions with a bool32 result.
binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared on the unsized float type while slt, seq
# and sne are float32 -- confirm whether the difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 691
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a uint32, whatever the size of the value being
# shifted, and it is masked with (bit width of src0) - 1 before use.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Rotates: the bits shifted out on one side are ORed back in on the other.
# Both shift counts are masked, so a rotate by 0 never shifts by the full
# width.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")
 712
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")


# Dot products: fdot2/3/4 produce one component, the _replicated variants
# are declared with an output size of 4.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: dot product of a 3-component and a 4-component source, in which the
# w component of the second source is added without being multiplied.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.  The float variants carry no algebraic properties,
# while the integer variants are marked commutative and associative.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
 741
 742 # Saturating add of four packed 8-bit channels: each per-byte sum is clamped to 0xff.
 743 binop("usadd_4x8", tint32, _2src_commutative + associative, """
 744 dst = 0;
 745 for (int i = 0; i < 32; i += 8) {
 746    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
 747 }
 748 """)
 749
 750 # Saturating subtract of four packed 8-bit channels: a channel stays 0 unless src0's byte is greater than src1's.
 751 binop("ussub_4x8", tint32, "", """
 752 dst = 0;
 753 for (int i = 0; i < 32; i += 8) {
 754    int src0_chan = (src0 >> i) & 0xff;
 755    int src1_chan = (src1 >> i) & 0xff;
 756    if (src0_chan > src1_chan)
 757       dst |= (src0_chan - src1_chan) << i;
 758 }
 759 """)
 760
 761 # Per-byte minimum of four packed 8-bit channels.
 762 binop("umin_4x8", tint32, _2src_commutative + associative, """
 763 dst = 0;
 764 for (int i = 0; i < 32; i += 8) {
 765    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 766 }
 767 """)
 768
 769 # Per-byte maximum of four packed 8-bit channels.
 770 binop("umax_4x8", tint32, _2src_commutative + associative, """
 771 dst = 0;
 772 for (int i = 0; i < 32; i += 8) {
 773    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
 774 }
 775 """)
 776
 777 # Per-byte unorm multiply of four packed 8-bit channels: (a * b) / 255, using integer division.
 778 binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
 779 dst = 0;
 780 for (int i = 0; i < 32; i += 8) {
 781    int src0_chan = (src0 >> i) & 0xff;
 782    int src1_chan = (src1 >> i) & 0xff;
 783    dst |= ((src0_chan * src1_chan) / 255) << i;
 784 }
 785 """)
 786
 787 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 788
 789 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
 790             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")  # packed src0.x in the low 16 bits, packed src1.x in the high 16 bits
 791
 792 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 793               "src0 | ((uint64_t)src1 << 32)")  # src0 in the low 32 bits, src1 in the high 32 bits
 794
 795 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
 796               "src0 | ((uint32_t)src1 << 16)")  # src0 in the low 16 bits, src1 in the high 16 bits
 797
 798 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 799 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
 800 # are taken from the low five bits of src0 and src1, respectively.
 801 binop_convert("bfm", tuint32, tint32, "", """
 802 int bits = src0 & 0x1F;
 803 int offset = src1 & 0x1F;
 804 dst = ((1u << bits) - 1) << offset;
 805 """)  # result: a run of `bits` one-bits, shifted left by `offset`
 806
 807 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
 808 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 809 /* flush denormals to zero. */
 810 if (!isnormal(dst))
 811    dst = copysignf(0.0f, src0);
 812 """)  # the zero takes its sign from src0. NOTE(review): isnormal() is also false for infinities and NaNs, so those results become a signed zero too -- confirm intended
 813
 814 # Combines the first component of each input to make a 2-component vector.
 815
 816 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
 817 dst.x = src0.x;
 818 dst.y = src1.x;
 819 """)  # dst = (src0.x, src1.x)
 820
 821 # Byte extraction: src0 is shifted right by src1 * 8 and the result is cast to (u)int8_t.
 822 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")  # unsigned cast
 823 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")  # signed cast
 824
 825 # Word extraction: src0 is shifted right by src1 * 16 and the result is cast to (u)int16_t.
 826 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")  # unsigned cast
 827 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")  # signed cast
 828
 829
 830 def triop(name, ty, alg_props, const_expr):
 831    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
 832 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
 833    opcode(name, output_size, tuint,
 834    [src1_size, src2_size, src3_size],
 835    [tuint, tuint, tuint], False, "", const_expr)
 836
 837 triop("ffma", tfloat, _2src_commutative, "src0 * src1 + src2")  # floating-point multiply-add
 838
 839 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")  # linear interpolation: yields src0 when src2 == 0 and src1 when src2 == 1
 840
 841 # Conditional Select
 842 #
 843 # A vector conditional select instruction (like ?:, but operating per-
 844 # component on vectors). There are two versions, one for floating point
 845 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 846
 847
 848 triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")  # floating-point-bool version: src1 when src0 is non-zero, otherwise src2
 849
 850 # 3 way min/max/med
 851 triop("fmin3", tfloat, "", "fminf(src0, fminf(src1, src2))")
 852 triop("imin3", tint, "", "MIN2(src0, MIN2(src1, src2))")
 853 triop("umin3", tuint, "", "MIN2(src0, MIN2(src1, src2))")
 854
 855 triop("fmax3", tfloat, "", "fmaxf(src0, fmaxf(src1, src2))")
 856 triop("imax3", tint, "", "MAX2(src0, MAX2(src1, src2))")
 857 triop("umax3", tuint, "", "MAX2(src0, MAX2(src1, src2))")
 858
 859 triop("fmed3", tfloat, "", "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
 860 triop("imed3", tint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 861 triop("umed3", tuint, "", "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
 862
 863 opcode("bcsel", 0, tuint, [0, 0, 0],
 864       [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")  # select with a tbool1 condition
 865 opcode("b32csel", 0, tuint, [0, 0, 0],
 866        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")  # select with a tbool32 condition
 867
 868 # SM5 bfi assembly: src0 is the mask, src1 the value to insert, src2 the base. The insert value is shifted left once per trailing zero bit of the mask before merging.
 869 triop("bfi", tuint32, "", """
 870 unsigned mask = src0, insert = src1, base = src2;
 871 if (mask == 0) {
 872    dst = base;
 873 } else {
 874    unsigned tmp = mask;
 875    while (!(tmp & 1)) {
 876       tmp >>= 1;
 877       insert <<= 1;
 878    }
 879    dst = (base & ~mask) | (insert & mask);
 880 }
 881 """)  # the mask == 0 case is handled first, so the while loop always finds a set bit and terminates
 882
 883
 884 triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")  # per bit: src1 where src0 is set, src2 where src0 is clear
 885
 886 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
 887 opcode("ubfe", 0, tuint32,
 888        [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
 889 unsigned base = src0;
 890 unsigned offset = src1 & 0x1F;
 891 unsigned bits = src2 & 0x1F;
 892 if (bits == 0) {
 893    dst = 0;
 894 } else if (offset + bits < 32) {
 895    dst = (base << (32 - bits - offset)) >> (32 - bits);
 896 } else {
 897    dst = base >> offset;
 898 }
 899 """)  # unsigned extract: the field is moved to the top of the word, then shifted down to bit 0 with an unsigned shift
 900 opcode("ibfe", 0, tint32,
 901        [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
 902 int base = src0;
 903 unsigned offset = src1 & 0x1F;
 904 unsigned bits = src2 & 0x1F;
 905 if (bits == 0) {
 906    dst = 0;
 907 } else if (offset + bits < 32) {
 908    dst = (base << (32 - bits - offset)) >> (32 - bits);
 909 } else {
 910    dst = base >> offset;
 911 }
 912 """)  # signed extract: same structure as ubfe but `base` is an int, so the right shifts are presumably relied on to sign-extend
 913
 914 # GLSL bitfieldExtract(), unsigned base. Out-of-range offset/bits (negative, or offset + bits > 32) yield 0.
 915 opcode("ubitfield_extract", 0, tuint32,
 916        [0, 0, 0], [tuint32, tint32, tint32], False, "", """
 917 unsigned base = src0;
 918 int offset = src1, bits = src2;
 919 if (bits == 0) {
 920    dst = 0;
 921 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
 922    dst = 0; /* undefined per the spec */
 923 } else {
 924    dst = (base >> offset) & ((1ull << bits) - 1);
 925 }
 926 """)  # the mask is built from 1ull, so bits == 32 does not shift a 32-bit value by its full width
 927 opcode("ibitfield_extract", 0, tint32,
 928        [0, 0, 0], [tint32, tint32, tint32], False, "", """
 929 int base = src0;
 930 int offset = src1, bits = src2;
 931 if (bits == 0) {
 932    dst = 0;
 933 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
 934    dst = 0;
 935 } else {
 936    dst = (base << (32 - offset - bits)) >> offset; /* use sign-extending shift */
 937 }
 938 """)
 939
 940 # Combines the first component of each input to make a 3-component vector.
 941
 942 triop_horiz("vec3", 3, 1, 1, 1, """
 943 dst.x = src0.x;
 944 dst.y = src1.x;
 945 dst.z = src2.x;
 946 """)  # dst = (src0.x, src1.x, src2.x)
 947
 948 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
 949                  src4_size, const_expr):
 950    opcode(name, output_size, tuint,
 951           [src1_size, src2_size, src3_size, src4_size],
 952           [tuint, tuint, tuint, tuint],
 953           False, "", const_expr)
 954
 955 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
 956        [tuint32, tuint32, tint32, tint32], False, "", """
 957 unsigned base = src0, insert = src1;
 958 int offset = src2, bits = src3;
 959 if (bits == 0) {
 960    dst = base;
 961 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
 962    dst = 0;
 963 } else {
 964    unsigned mask = ((1ull << bits) - 1) << offset;
 965    dst = (base & ~mask) | ((insert << offset) & mask);
 966 }
 967 """)  # replaces `bits` bits of base, starting at `offset`, with the low bits of insert; out-of-range offset/bits yield 0
 968
 969 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
 970 dst.x = src0.x;
 971 dst.y = src1.x;
 972 dst.z = src2.x;
 973 dst.w = src3.x;
 974 """)  # combines the first component of each input into a 4-component vector
 975
 976 # ir3-specific instruction that maps directly to mul-add shift high mix
 977 # (IMADSH_MIX16, i.e. ((ah * bl) << 16) + c). It is used for lowering integer
 978 # multiplication (imul) on the Freedreno backend.
 979 opcode("imadsh_mix16", 1, tint32,
 980        [1, 1, 1], [tint32, tint32, tint32], False, "", """
 981 dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
 982 """)  # ah = high 16 bits of src0.x, bl = low 16 bits of src1.x, c = src2.x