/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

#define COND_LOWER_OP(b, name, ...) \
   (b->shader->options->lower_int64_options & \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \
   lower_##name##64(b, __VA_ARGS__) : nir_##name(b, __VA_ARGS__)

#define COND_LOWER_CMP(b, name, ...) \
   (b->shader->options->lower_int64_options & \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \
   lower_int64_compare(b, nir_op_##name, __VA_ARGS__) : \
   nir_##name(b, __VA_ARGS__)

#define COND_LOWER_CAST(b, name, ...) \
   (b->shader->options->lower_int64_options & \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \
   lower_##name(b, __VA_ARGS__) : \
   nir_##name(b, __VA_ARGS__)
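
/* The COND_LOWER_* macros check whether the 64-bit form of the named op is
 * itself flagged for lowering: if so, the composite lowerings below (such as
 * lower_2f) recurse into the matching lower_*() helper; otherwise they emit
 * the op directly and leave it to the back-end.
 */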

static nir_ssa_def *
lower_b2i64(nir_builder *b, nir_ssa_def *x)
{
   return nir_pack_64_2x32_split(b, nir_b2i32(b, x), nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2b(nir_builder *b, nir_ssa_def *x)
{
   return nir_ine(b, nir_ior(b, nir_unpack_64_2x32_split_x(b, x),
                                nir_unpack_64_2x32_split_y(b, x)),
                  nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2i8(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i16(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_i2i64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_i2i32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_ishr(b, x32, nir_imm_int(b, 31)));
}
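
/* Sign extension in lower_i2i64: for x = -2 (0xfffffffe in 32 bits), the
 * arithmetic shift x32 >> 31 produces 0xffffffff, so the packed result is
 * 0xfffffffffffffffe, i.e. -2 in 64 bits.
 */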

static nir_ssa_def *
lower_u2u8(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u16(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_u2u64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_u2u32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_bcsel64(nir_builder *b, nir_ssa_def *cond,
              nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_bcsel(b, cond, x_lo, y_lo),
                                 nir_bcsel(b, cond, x_hi, y_hi));
}

static nir_ssa_def *
lower_inot64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   return nir_pack_64_2x32_split(b, nir_inot(b, x_lo), nir_inot(b, x_hi));
}

static nir_ssa_def *
lower_iand64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_iand(b, x_lo, y_lo),
                                 nir_iand(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ior64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ior(b, x_lo, y_lo),
                                 nir_ior(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ixor(b, x_lo, y_lo),
                                 nir_ixor(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t lshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo << c;
    *       uint32_t hi_shifted = hi << c;
    *       uint32_t lo_shifted_hi = lo >> abs(32 - c);
    *       return pack_64(lo_shifted, hi_shifted | lo_shifted_hi);
    *    } else {
    *       uint32_t lo_shifted_hi = lo << abs(32 - c);
    *       return pack_64(0, lo_shifted_hi);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y);
   nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, lo_shifted,
                             nir_ior(b, hi_shifted, lo_shifted_hi));
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_imm_int(b, 0),
                             nir_ishl(b, x_lo, reverse_count));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}
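
/* Example for lower_ishl64: with c = 8, reverse_count = |8 - 32| = 24 and
 * the c < 32 path yields pack_64(lo << 8, (hi << 8) | (lo >> 24)).  With
 * c = 40, reverse_count = 8 and the c >= 32 path yields pack_64(0, lo << 8):
 * bit 0 of lo lands at bit 40 of the result, as expected.
 */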

static nir_ssa_def *
lower_ishr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t arshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x);
    *    int32_t  hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted = hi >> 31;
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                             hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count),
                             nir_ishr(b, x_hi, nir_imm_int(b, 31)));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t rshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(0, hi_shifted_lo);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                             hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count),
                             nir_imm_int(b, 0));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_iadd(b, x_lo, y_lo);
   nir_ssa_def *carry = nir_b2i32(b, nir_ult(b, res_lo, x_lo));
   nir_ssa_def *res_hi = nir_iadd(b, carry, nir_iadd(b, x_hi, y_hi));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
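
/* Note on lower_iadd64 above: the carry out of the low dword is recovered
 * without a dedicated carry opcode.  Unsigned addition wrapped iff
 * res_lo < x_lo, e.g. 0xffffffff + 1 gives res_lo == 0 < x_lo, so a 1 is
 * folded into the high dword sum.
 */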

static nir_ssa_def *
lower_isub64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_isub(b, x_lo, y_lo);
   nir_ssa_def *borrow = nir_ineg(b, nir_b2i32(b, nir_ult(b, x_lo, y_lo)));
   nir_ssa_def *res_hi = nir_iadd(b, nir_isub(b, x_hi, y_hi), borrow);

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
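
/* Likewise for lower_isub64: a borrow is needed iff x_lo < y_lo, and
 * nir_ineg turns the boolean's 0/1 into 0/-1 so it can simply be added to
 * the high dword difference.
 */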

static nir_ssa_def *
lower_ineg64(nir_builder *b, nir_ssa_def *x)
{
   /* Since isub is the same number of instructions (with better dependencies)
    * as iadd, subtraction is actually more efficient for ineg than the usual
    * 2's complement "flip the bits and add one".
    */
   return lower_isub64(b, nir_imm_int64(b, 0), x);
}

static nir_ssa_def *
lower_iabs64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *x_is_neg = nir_ilt(b, x_hi, nir_imm_int(b, 0));
   return nir_bcsel(b, x_is_neg, nir_ineg(b, x), x);
}

static nir_ssa_def *
lower_int64_compare(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   switch (op) {
   case nir_op_ieq:
      return nir_iand(b, nir_ieq(b, x_hi, y_hi), nir_ieq(b, x_lo, y_lo));
   case nir_op_ine:
      return nir_ior(b, nir_ine(b, x_hi, y_hi), nir_ine(b, x_lo, y_lo));
   case nir_op_ult:
      return nir_ior(b, nir_ult(b, x_hi, y_hi),
                     nir_iand(b, nir_ieq(b, x_hi, y_hi),
                              nir_ult(b, x_lo, y_lo)));
   case nir_op_ilt:
      return nir_ior(b, nir_ilt(b, x_hi, y_hi),
                     nir_iand(b, nir_ieq(b, x_hi, y_hi),
                              nir_ult(b, x_lo, y_lo)));
   case nir_op_uge:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ult, x, y));
   case nir_op_ige:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ilt, x, y));
   default:
      unreachable("Invalid comparison");
   }
}

static nir_ssa_def *
lower_umax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), y, x);
}

static nir_ssa_def *
lower_imax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), y, x);
}

static nir_ssa_def *
lower_umin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), x, y);
}

static nir_ssa_def *
lower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
}

static nir_ssa_def *
lower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                  bool sign_extend)
{
   nir_ssa_def *res_hi = sign_extend ? nir_imul_high(b, x, y)
                                     : nir_umul_high(b, x, y);

   return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi);
}

static nir_ssa_def *
lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo);
   nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo),
                                  nir_iadd(b, nir_imul(b, x_lo, y_hi),
                                           nir_imul(b, x_hi, y_lo)));

   return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo),
                                 res_hi);
}
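
/* lower_imul64 needs only three 32-bit multiplies for the low 64 bits of
 * the product: the x_hi * y_hi term contributes exclusively to bits
 * [64, 127], which a 64-bit multiply discards anyway.
 */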

static nir_ssa_def *
lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                 bool sign_extend)
{
   nir_ssa_def *x32[4], *y32[4];
   x32[0] = nir_unpack_64_2x32_split_x(b, x);
   x32[1] = nir_unpack_64_2x32_split_y(b, x);
   if (sign_extend) {
      x32[2] = x32[3] = nir_ishr(b, x32[1], nir_imm_int(b, 31));
   } else {
      x32[2] = x32[3] = nir_imm_int(b, 0);
   }

   y32[0] = nir_unpack_64_2x32_split_x(b, y);
   y32[1] = nir_unpack_64_2x32_split_y(b, y);
   if (sign_extend) {
      y32[2] = y32[3] = nir_ishr(b, y32[1], nir_imm_int(b, 31));
   } else {
      y32[2] = y32[3] = nir_imm_int(b, 0);
   }

   nir_ssa_def *res[8] = { NULL, };

   /* Yes, the following generates a pile of code.  However, we throw res[0]
    * and res[1] away in the end and, if we're in the umul case, four of our
    * eight dword operands will be constant zero and opt_algebraic will clean
    * this up nicely.
    */
   for (unsigned i = 0; i < 4; i++) {
      nir_ssa_def *carry = NULL;
      for (unsigned j = 0; j < 4; j++) {
         /* The maximum values of x32[i] and y32[i] are UINT32_MAX so the
          * maximum value of tmp is UINT32_MAX * UINT32_MAX.  The maximum
          * value that will fit in tmp is
          *
          *    UINT64_MAX = UINT32_MAX << 32 + UINT32_MAX
          *               = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
          *               = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
          *
          * so we're guaranteed that we can add in two more 32-bit values
          * without overflowing tmp.
          */
         nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[j]);

         if (res[i + j])
            tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
         if (carry)
            tmp = nir_iadd(b, tmp, carry);
         res[i + j] = nir_u2u32(b, tmp);
         carry = nir_ushr(b, tmp, nir_imm_int(b, 32));
      }
      res[i + 4] = nir_u2u32(b, carry);
   }

   return nir_pack_64_2x32_split(b, res[2], res[3]);
}
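
/* In lower_mul_high64, res[0] and res[1] are bits [0, 63] of the full
 * 128-bit product and are thrown away; res[2] and res[3] are bits
 * [64, 127], exactly the [iu]mul_high result.
 */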

static nir_ssa_def *
lower_isign64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi));
   nir_ssa_def *res_hi = nir_ishr(b, x_hi, nir_imm_int(b, 31));
   nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
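
/* lower_isign64 works because res_hi = x_hi >> 31 is 0 for non-negative x
 * and all-ones (-1) for negative x; oring the "is non-zero" bit into the
 * low dword then yields exactly -1, 0, or 1.  E.g. x = 5: res_hi = 0,
 * res_lo = 0 | 1 = 1.
 */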

static void
lower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d,
                   nir_ssa_def **q, nir_ssa_def **r)
{
   /* TODO: We should specially handle the case where the denominator is a
    * constant.  In that case, we should be able to reduce it to a multiply by
    * a constant, some shifts, and an add.
    */
   nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n);
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *q_lo = nir_imm_zero(b, n->num_components, 32);
   nir_ssa_def *q_hi = nir_imm_zero(b, n->num_components, 32);

   nir_ssa_def *n_hi_before_if = n_hi;
   nir_ssa_def *q_hi_before_if = q_hi;

   /* If the upper 32 bits of denom are non-zero, it is impossible for shifts
    * greater than 32 bits to occur.  If the upper 32 bits of the numerator
    * are zero, it is impossible for (denom << [63, 32]) <= numer unless
    * denom == 0.
    */
   nir_ssa_def *need_high_div =
      nir_iand(b, nir_ieq(b, d_hi, nir_imm_int(b, 0)), nir_uge(b, n_hi, d_lo));
   nir_push_if(b, nir_bany(b, need_high_div));
   {
      /* If we only have one component, then the bany above goes away and
       * this is always true within the if statement.
       */
      if (n->num_components == 1)
         need_high_div = nir_imm_true(b);

      nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo);

      for (int i = 31; i >= 0; i--) {
         /* if ((d.x << i) <= n.y) {
          *    n.y -= d.x << i;
          *    quot.y |= 1U << i;
          * }
          */
         nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i));
         nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift);
         nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i));
         nir_ssa_def *cond = nir_iand(b, need_high_div,
                                      nir_uge(b, n_hi, d_shift));
         if (i != 0) {
            /* log2_d_lo is always <= 31, so we don't need to bother with it
             * in the last iteration.
             */
            cond = nir_iand(b, cond,
                            nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo));
         }
         n_hi = nir_bcsel(b, cond, new_n_hi, n_hi);
         q_hi = nir_bcsel(b, cond, new_q_hi, q_hi);
      }
   }
   nir_pop_if(b, NULL);
   n_hi = nir_if_phi(b, n_hi, n_hi_before_if);
   q_hi = nir_if_phi(b, q_hi, q_hi_before_if);

   nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi);

   n = nir_pack_64_2x32_split(b, n_lo, n_hi);
   d = nir_pack_64_2x32_split(b, d_lo, d_hi);
   for (int i = 31; i >= 0; i--) {
      /* if ((d64 << i) <= n64) {
       *    n64 -= d64 << i;
       *    quot.x |= 1U << i;
       * }
       */
      nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i));
      nir_ssa_def *new_n = nir_isub(b, n, d_shift);
      nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i));
      nir_ssa_def *cond = nir_uge(b, n, d_shift);
      if (i != 0) {
         /* log2_denom is always <= 31, so we don't need to bother with it
          * in the last iteration.
          */
         cond = nir_iand(b, cond,
                         nir_ige(b, nir_imm_int(b, 31 - i), log2_denom));
      }
      n = nir_bcsel(b, cond, new_n, n);
      q_lo = nir_bcsel(b, cond, new_q_lo, q_lo);
   }

   *q = nir_pack_64_2x32_split(b, q_lo, q_hi);
   *r = n;
}
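
/* lower_udiv64_mod64 is classic restoring shift-and-subtract long division:
 * each of the up to 64 quotient bits costs a shift, a compare, and two
 * bcsels, so the expansion is large but, apart from the need_high_div
 * guard, entirely branch-free.
 */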

static nir_ssa_def *
lower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return q;
}

static nir_ssa_def *
lower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)),
                                 nir_ilt(b, d_hi, nir_imm_int(b, 0)));
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, negate, nir_ineg(b, q), q);
}

static nir_ssa_def *
lower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return r;
}

static nir_ssa_def *
lower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
   nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);

   nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);

   return nir_bcsel(b, nir_ieq(b, r, nir_imm_int64(b, 0)), nir_imm_int64(b, 0),
                    nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
                              nir_iadd(b, rem, d)));
}

static nir_ssa_def *
lower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
}

static nir_ssa_def *
lower_extract(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *c)
{
   assert(op == nir_op_extract_u8 || op == nir_op_extract_i8 ||
          op == nir_op_extract_u16 || op == nir_op_extract_i16);

   const int chunk = nir_src_as_uint(nir_src_for_ssa(c));
   const int chunk_bits =
      (op == nir_op_extract_u8 || op == nir_op_extract_i8) ? 8 : 16;
   const int num_chunks_in_32 = 32 / chunk_bits;

   nir_ssa_def *extract32;
   if (chunk < num_chunks_in_32) {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_x(b, x),
                                nir_imm_int(b, chunk),
                                NULL, NULL);
   } else {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_y(b, x),
                                nir_imm_int(b, chunk - num_chunks_in_32),
                                NULL, NULL);
   }

   if (op == nir_op_extract_i8 || op == nir_op_extract_i16)
      return lower_i2i64(b, extract32);
   else
      return lower_u2u64(b, extract32);
}

static nir_ssa_def *
lower_ufind_msb64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *lo_count = nir_ufind_msb(b, x_lo);
   nir_ssa_def *hi_count = nir_ufind_msb(b, x_hi);
   nir_ssa_def *valid_hi_bits = nir_ine(b, x_hi, nir_imm_int(b, 0));
   nir_ssa_def *hi_res = nir_iadd(b, nir_imm_intN_t(b, 32, 32), hi_count);
   return nir_bcsel(b, valid_hi_bits, hi_res, lo_count);
}
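
/* nir_ufind_msb returns -1 for a zero source, so when x == 0 both halves
 * report -1 and the bcsel falls through to lo_count == -1, matching the
 * 32-bit opcode's behavior.
 */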

static nir_ssa_def *
lower_2f(nir_builder *b, nir_ssa_def *x, unsigned dest_bit_size,
         bool src_is_signed)
{
   nir_ssa_def *x_sign = NULL;

   if (src_is_signed) {
      x_sign = nir_bcsel(b, COND_LOWER_CMP(b, ilt, x, nir_imm_int64(b, 0)),
                         nir_imm_floatN_t(b, -1, dest_bit_size),
                         nir_imm_floatN_t(b, 1, dest_bit_size));
      x = COND_LOWER_OP(b, iabs, x);
   }

   nir_ssa_def *exp = COND_LOWER_OP(b, ufind_msb, x);
   unsigned significand_bits;

   switch (dest_bit_size) {
   case 32:
      significand_bits = 23;
      break;
   case 16:
      significand_bits = 10;
      break;
   default:
      unreachable("Invalid dest_bit_size");
   }

   /* We keep one more bit than can fit in the significand field to let the
    * u2f32 conversion do the rounding for us.
    */
   nir_ssa_def *discard =
      nir_imax(b, nir_isub(b, exp, nir_imm_int(b, significand_bits + 1)),
               nir_imm_int(b, 0));

   /* Part of the "round to nearest" has to be taken care of before we discard
    * the LSB, and that's what this extra iadd is for.
    * "Round to nearest even" is handled by u2f. That works because the
    * shifted value either fits in the significand field (which means no
    * rounding is required) or contains one extra bit that forces the
    * conversion op to round things properly.
    */
   nir_ssa_def *add = COND_LOWER_OP(b, ishl, nir_imm_int64(b, 1), discard);
   add = COND_LOWER_OP(b, isub, add, nir_imm_int64(b, 1));
   nir_ssa_def *rounded_x = COND_LOWER_OP(b, iadd, x, add);

   /* Signed Values can't overflow because we've saved the sign and promoted
    * them to unsigned values.
    */
   if (!src_is_signed) {
      nir_ssa_def *overflow = COND_LOWER_CMP(b, ult, rounded_x, x);
      rounded_x = COND_LOWER_OP(b, bcsel, overflow,
                                nir_imm_int64(b, UINT64_MAX), rounded_x);
   }

   nir_ssa_def *significand = COND_LOWER_OP(b, ushr, rounded_x, discard);
   significand = COND_LOWER_CAST(b, u2u32, significand);

   nir_ssa_def *res;
   if (dest_bit_size == 32)
      res = nir_fmul(b, nir_u2f32(b, significand),
                     nir_fexp2(b, nir_u2f32(b, discard)));
   else
      res = nir_fmul(b, nir_u2f16(b, significand),
                     nir_fexp2(b, nir_u2f16(b, discard)));

   if (x_sign)
      res = nir_fmul(b, res, x_sign);

   return res;
}
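
/* Worked example for lower_2f with dest_bit_size == 32 (23 stored
 * significand bits): a value whose MSB is bit 40 gets
 * discard = max(40 - 24, 0) = 16, so 25 bits survive the shift, one more
 * than a float32 significand holds exactly, which is what lets u2f32 itself
 * perform the final round-to-nearest-even step.
 */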

static nir_ssa_def *
lower_f2(nir_builder *b, nir_ssa_def *x, bool dst_is_signed)
{
   assert(x->bit_size == 16 || x->bit_size == 32);
   nir_ssa_def *x_sign = NULL;

   if (dst_is_signed)
      x_sign = nir_fsign(b, x);
   else
      x = nir_fmin(b, x, nir_imm_floatN_t(b, UINT64_MAX, x->bit_size));

   x = nir_ftrunc(b, x);

   if (dst_is_signed) {
      x = nir_fmin(b, x, nir_imm_floatN_t(b, INT64_MAX, x->bit_size));
      x = nir_fmax(b, x, nir_imm_floatN_t(b, INT64_MIN, x->bit_size));
      x = nir_fabs(b, x);
   }

   nir_ssa_def *div = nir_imm_floatN_t(b, 1ULL << 32, x->bit_size);
   nir_ssa_def *res_hi = nir_f2u32(b, nir_fdiv(b, x, div));
   nir_ssa_def *res_lo = nir_f2u32(b, nir_frem(b, x, div));
   nir_ssa_def *res = nir_pack_64_2x32_split(b, res_lo, res_hi);

   if (dst_is_signed)
      res = nir_bcsel(b, nir_flt(b, x_sign, nir_imm_float(b, 0)),
                      nir_ineg(b, res), res);

   return res;
}
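
/* lower_f2 splits the float with the inverse trick: x / 2^32 yields the
 * high dword and frem(x, 2^32) the low dword, both in [0, 2^32) after the
 * earlier truncation and clamping, so each f2u32 is exact for the integer
 * values that survive.
 */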

nir_lower_int64_options
nir_lower_int64_op_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_imul:
   case nir_op_amul:
      return nir_lower_imul64;
   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      return nir_lower_imul_2x32_64;
   case nir_op_imul_high:
   case nir_op_umul_high:
      return nir_lower_imul_high64;
   case nir_op_isign:
      return nir_lower_isign64;
   case nir_op_udiv:
   case nir_op_idiv:
   case nir_op_umod:
   case nir_op_imod:
   case nir_op_irem:
      return nir_lower_divmod64;
   case nir_op_b2i64:
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_i2i64:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_u2u64:
   case nir_op_i2f64:
   case nir_op_u2f64:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_bcsel:
      return nir_lower_mov64;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return nir_lower_icmp64;
   case nir_op_iadd:
   case nir_op_isub:
      return nir_lower_iadd64;
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
   case nir_op_imin3:
   case nir_op_imax3:
   case nir_op_umin3:
   case nir_op_umax3:
   case nir_op_imed3:
   case nir_op_umed3:
      return nir_lower_minmax64;
   case nir_op_iabs:
      return nir_lower_iabs64;
   case nir_op_ineg:
      return nir_lower_ineg64;
   case nir_op_iand:
   case nir_op_ior:
   case nir_op_ixor:
   case nir_op_inot:
      return nir_lower_logic64;
   case nir_op_ishl:
   case nir_op_ishr:
   case nir_op_ushr:
      return nir_lower_shift64;
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return nir_lower_extract64;
   case nir_op_ufind_msb:
      return nir_lower_ufind_msb64;
   default:
      return 0;
   }
}

static nir_ssa_def *
lower_int64_alu_instr(nir_builder *b, nir_instr *instr, void *_state)
{
   nir_alu_instr *alu = nir_instr_as_alu(instr);

   nir_ssa_def *src[4];
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
      src[i] = nir_ssa_for_alu_src(b, alu, i);

   switch (alu->op) {
   case nir_op_imul:
   case nir_op_amul:
      return lower_imul64(b, src[0], src[1]);
   case nir_op_imul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], true);
   case nir_op_umul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], false);
   case nir_op_imul_high:
      return lower_mul_high64(b, src[0], src[1], true);
   case nir_op_umul_high:
      return lower_mul_high64(b, src[0], src[1], false);
   case nir_op_isign:
      return lower_isign64(b, src[0]);
   case nir_op_udiv:
      return lower_udiv64(b, src[0], src[1]);
   case nir_op_idiv:
      return lower_idiv64(b, src[0], src[1]);
   case nir_op_umod:
      return lower_umod64(b, src[0], src[1]);
   case nir_op_imod:
      return lower_imod64(b, src[0], src[1]);
   case nir_op_irem:
      return lower_irem64(b, src[0], src[1]);
   case nir_op_b2i64:
      return lower_b2i64(b, src[0]);
   case nir_op_i2b1:
      return lower_i2b(b, src[0]);
   case nir_op_i2i8:
      return lower_i2i8(b, src[0]);
   case nir_op_i2i16:
      return lower_i2i16(b, src[0]);
   case nir_op_i2i32:
      return lower_i2i32(b, src[0]);
   case nir_op_i2i64:
      return lower_i2i64(b, src[0]);
   case nir_op_u2u8:
      return lower_u2u8(b, src[0]);
   case nir_op_u2u16:
      return lower_u2u16(b, src[0]);
   case nir_op_u2u32:
      return lower_u2u32(b, src[0]);
   case nir_op_u2u64:
      return lower_u2u64(b, src[0]);
   case nir_op_bcsel:
      return lower_bcsel64(b, src[0], src[1], src[2]);
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return lower_int64_compare(b, alu->op, src[0], src[1]);
   case nir_op_iadd:
      return lower_iadd64(b, src[0], src[1]);
   case nir_op_isub:
      return lower_isub64(b, src[0], src[1]);
   case nir_op_imin:
      return lower_imin64(b, src[0], src[1]);
   case nir_op_imax:
      return lower_imax64(b, src[0], src[1]);
   case nir_op_umin:
      return lower_umin64(b, src[0], src[1]);
   case nir_op_umax:
      return lower_umax64(b, src[0], src[1]);
   case nir_op_imin3:
      return lower_imin64(b, src[0], lower_imin64(b, src[1], src[2]));
   case nir_op_imax3:
      return lower_imax64(b, src[0], lower_imax64(b, src[1], src[2]));
   case nir_op_umin3:
      return lower_umin64(b, src[0], lower_umin64(b, src[1], src[2]));
   case nir_op_umax3:
      return lower_umax64(b, src[0], lower_umax64(b, src[1], src[2]));
   case nir_op_imed3:
      return lower_imax64(b,
                          lower_imin64(b, lower_imax64(b, src[0], src[1]),
                                       src[2]),
                          lower_imin64(b, src[0], src[1]));
   case nir_op_umed3:
      return lower_umax64(b,
                          lower_umin64(b, lower_umax64(b, src[0], src[1]),
                                       src[2]),
                          lower_umin64(b, src[0], src[1]));
   case nir_op_iabs:
      return lower_iabs64(b, src[0]);
   case nir_op_ineg:
      return lower_ineg64(b, src[0]);
   case nir_op_iand:
      return lower_iand64(b, src[0], src[1]);
   case nir_op_ior:
      return lower_ior64(b, src[0], src[1]);
   case nir_op_ixor:
      return lower_ixor64(b, src[0], src[1]);
   case nir_op_inot:
      return lower_inot64(b, src[0]);
   case nir_op_ishl:
      return lower_ishl64(b, src[0], src[1]);
   case nir_op_ishr:
      return lower_ishr64(b, src[0], src[1]);
   case nir_op_ushr:
      return lower_ushr64(b, src[0], src[1]);
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return lower_extract(b, alu->op, src[0], src[1]);
   case nir_op_ufind_msb:
      return lower_ufind_msb64(b, src[0]);
   case nir_op_i2f64:
   case nir_op_i2f32:
   case nir_op_i2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), true);
   case nir_op_u2f64:
   case nir_op_u2f32:
   case nir_op_u2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), false);
   case nir_op_f2i64:
   case nir_op_f2u64:
      /* We don't support f64toi64 (yet?). */
      if (src[0]->bit_size > 32)
         return NULL;

      return lower_f2(b, src[0], alu->op == nir_op_f2i64);
   default:
      unreachable("Invalid ALU opcode to lower");
   }
}

static bool
should_lower_int64_alu_instr(const nir_instr *instr, const void *_data)
{
   const nir_shader_compiler_options *options =
      (const nir_shader_compiler_options *)_data;

   if (instr->type != nir_instr_type_alu)
      return false;

   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_bcsel:
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[2].src.is_ssa);
      assert(alu->src[1].src.ssa->bit_size ==
             alu->src[2].src.ssa->bit_size);
      if (alu->src[1].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      assert(alu->src[0].src.is_ssa);
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[0].src.ssa->bit_size ==
             alu->src[1].src.ssa->bit_size);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_ufind_msb:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_amul:
      assert(alu->dest.dest.is_ssa);
      if (options->has_imul24)
         return false;
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   case nir_op_i2f64:
   case nir_op_u2f64:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   default:
      assert(alu->dest.dest.is_ssa);
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   }

   unsigned mask = nir_lower_int64_op_to_options_mask(alu->op);
   return (options->lower_int64_options & mask) != 0;
}

bool
nir_lower_int64(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        should_lower_int64_alu_instr,
                                        lower_int64_alu_instr,
                                        (void *)shader->options);
}