/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

#define COND_LOWER_OP(b, name, ...) \
   (b->shader->options->lower_int64_options & \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \
   lower_##name##64(b, __VA_ARGS__) : nir_##name(b, __VA_ARGS__)

#define COND_LOWER_CMP(b, name, ...) \
   (b->shader->options->lower_int64_options & \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \
   lower_int64_compare(b, nir_op_##name, __VA_ARGS__) : \
   nir_##name(b, __VA_ARGS__)

#define COND_LOWER_CAST(b, name, ...) \
   (b->shader->options->lower_int64_options & \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \
   lower_##name(b, __VA_ARGS__) : \
   nir_##name(b, __VA_ARGS__)
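
/* The COND_LOWER_* macros check whether the 64-bit form of the named op is
 * itself flagged for lowering: if so, the composite lowerings below (such as
 * lower_2f) recurse into the matching lower_*() helper; otherwise they emit
 * the op directly and leave it to the back-end.
 */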

static nir_ssa_def *
lower_b2i64(nir_builder *b, nir_ssa_def *x)
{
   return nir_pack_64_2x32_split(b, nir_b2i32(b, x), nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2b(nir_builder *b, nir_ssa_def *x)
{
   return nir_ine(b, nir_ior(b, nir_unpack_64_2x32_split_x(b, x),
                                nir_unpack_64_2x32_split_y(b, x)),
                  nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2i8(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i16(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_i2i64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_i2i32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_ishr(b, x32, nir_imm_int(b, 31)));
}
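
/* Sign extension in lower_i2i64: for x = -2 (0xfffffffe in 32 bits), the
 * arithmetic shift x32 >> 31 produces 0xffffffff, so the packed result is
 * 0xfffffffffffffffe, i.e. -2 in 64 bits.
 */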

static nir_ssa_def *
lower_u2u8(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u16(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_u2u64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_u2u32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_bcsel64(nir_builder *b, nir_ssa_def *cond,
              nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_bcsel(b, cond, x_lo, y_lo),
                                 nir_bcsel(b, cond, x_hi, y_hi));
}

static nir_ssa_def *
lower_inot64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   return nir_pack_64_2x32_split(b, nir_inot(b, x_lo), nir_inot(b, x_hi));
}

static nir_ssa_def *
lower_iand64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_iand(b, x_lo, y_lo),
                                 nir_iand(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ior64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ior(b, x_lo, y_lo),
                                 nir_ior(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ixor(b, x_lo, y_lo),
                                 nir_ixor(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t lshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo << c;
    *       uint32_t hi_shifted = hi << c;
    *       uint32_t lo_shifted_hi = lo >> abs(32 - c);
    *       return pack_64(lo_shifted, hi_shifted | lo_shifted_hi);
    *    } else {
    *       uint32_t lo_shifted_hi = lo << abs(32 - c);
    *       return pack_64(0, lo_shifted_hi);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y);
   nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, lo_shifted,
                             nir_ior(b, hi_shifted, lo_shifted_hi));
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_imm_int(b, 0),
                             nir_ishl(b, x_lo, reverse_count));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}
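
/* Example for lower_ishl64: with c = 8, reverse_count = |8 - 32| = 24 and
 * the c < 32 path yields pack_64(lo << 8, (hi << 8) | (lo >> 24)).  With
 * c = 40, reverse_count = 8 and the c >= 32 path yields pack_64(0, lo << 8):
 * bit 0 of lo lands at bit 40 of the result, as expected.
 */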

static nir_ssa_def *
lower_ishr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t arshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x);
    *    int32_t  hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted = hi >> 31;
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                             hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count),
                             nir_ishr(b, x_hi, nir_imm_int(b, 31)));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t rshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(0, hi_shifted_lo);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                             hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count),
                             nir_imm_int(b, 0));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_iadd(b, x_lo, y_lo);
   nir_ssa_def *carry = nir_b2i32(b, nir_ult(b, res_lo, x_lo));
   nir_ssa_def *res_hi = nir_iadd(b, carry, nir_iadd(b, x_hi, y_hi));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
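
/* Note on lower_iadd64 above: the carry out of the low dword is recovered
 * without a dedicated carry opcode.  Unsigned addition wrapped iff
 * res_lo < x_lo, e.g. 0xffffffff + 1 gives res_lo == 0 < x_lo, so a 1 is
 * folded into the high dword sum.
 */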

static nir_ssa_def *
lower_isub64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_isub(b, x_lo, y_lo);
   nir_ssa_def *borrow = nir_ineg(b, nir_b2i32(b, nir_ult(b, x_lo, y_lo)));
   nir_ssa_def *res_hi = nir_iadd(b, nir_isub(b, x_hi, y_hi), borrow);

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
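
/* Likewise for lower_isub64: a borrow is needed iff x_lo < y_lo, and
 * nir_ineg turns the boolean's 0/1 into 0/-1 so it can simply be added to
 * the high dword difference.
 */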

static nir_ssa_def *
lower_ineg64(nir_builder *b, nir_ssa_def *x)
{
   /* Since isub is the same number of instructions (with better dependencies)
    * as iadd, subtraction is actually more efficient for ineg than the usual
    * 2's complement "flip the bits and add one".
    */
   return lower_isub64(b, nir_imm_int64(b, 0), x);
}

static nir_ssa_def *
lower_iabs64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *x_is_neg = nir_ilt(b, x_hi, nir_imm_int(b, 0));
   return nir_bcsel(b, x_is_neg, nir_ineg(b, x), x);
}

static nir_ssa_def *
lower_int64_compare(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   switch (op) {
   case nir_op_ieq:
      return nir_iand(b, nir_ieq(b, x_hi, y_hi), nir_ieq(b, x_lo, y_lo));
   case nir_op_ine:
      return nir_ior(b, nir_ine(b, x_hi, y_hi), nir_ine(b, x_lo, y_lo));
   case nir_op_ult:
      return nir_ior(b, nir_ult(b, x_hi, y_hi),
                     nir_iand(b, nir_ieq(b, x_hi, y_hi),
                              nir_ult(b, x_lo, y_lo)));
   case nir_op_ilt:
      return nir_ior(b, nir_ilt(b, x_hi, y_hi),
                     nir_iand(b, nir_ieq(b, x_hi, y_hi),
                              nir_ult(b, x_lo, y_lo)));
   case nir_op_uge:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ult, x, y));
   case nir_op_ige:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ilt, x, y));
   default:
      unreachable("Invalid comparison");
   }
}

static nir_ssa_def *
lower_umax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), y, x);
}

static nir_ssa_def *
lower_imax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), y, x);
}

static nir_ssa_def *
lower_umin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), x, y);
}

static nir_ssa_def *
lower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
}

static nir_ssa_def *
lower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                  bool sign_extend)
{
   nir_ssa_def *res_hi = sign_extend ? nir_imul_high(b, x, y)
                                     : nir_umul_high(b, x, y);

   return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi);
}

static nir_ssa_def *
lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo);
   nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo),
                                  nir_iadd(b, nir_imul(b, x_lo, y_hi),
                                           nir_imul(b, x_hi, y_lo)));

   return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo),
                                 res_hi);
}
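
/* lower_imul64 needs only three 32-bit multiplies for the low 64 bits of
 * the product: the x_hi * y_hi term contributes exclusively to bits
 * [64, 127], which a 64-bit multiply discards anyway.
 */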

static nir_ssa_def *
lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                 bool sign_extend)
{
   nir_ssa_def *x32[4], *y32[4];
   x32[0] = nir_unpack_64_2x32_split_x(b, x);
   x32[1] = nir_unpack_64_2x32_split_y(b, x);
   if (sign_extend) {
      x32[2] = x32[3] = nir_ishr(b, x32[1], nir_imm_int(b, 31));
   } else {
      x32[2] = x32[3] = nir_imm_int(b, 0);
   }

   y32[0] = nir_unpack_64_2x32_split_x(b, y);
   y32[1] = nir_unpack_64_2x32_split_y(b, y);
   if (sign_extend) {
      y32[2] = y32[3] = nir_ishr(b, y32[1], nir_imm_int(b, 31));
   } else {
      y32[2] = y32[3] = nir_imm_int(b, 0);
   }

   nir_ssa_def *res[8] = { NULL, };

   /* Yes, the following generates a pile of code.  However, we throw res[0]
    * and res[1] away in the end and, if we're in the umul case, four of our
    * eight dword operands will be constant zero and opt_algebraic will clean
    * this up nicely.
    */
   for (unsigned i = 0; i < 4; i++) {
      nir_ssa_def *carry = NULL;
      for (unsigned j = 0; j < 4; j++) {
         /* The maximum values of x32[i] and y32[i] are UINT32_MAX so the
          * maximum value of tmp is UINT32_MAX * UINT32_MAX.  The maximum
          * value that will fit in tmp is
          *
          *    UINT64_MAX = UINT32_MAX << 32 + UINT32_MAX
          *               = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
          *               = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
          *
          * so we're guaranteed that we can add in two more 32-bit values
          * without overflowing tmp.
          */
         nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[j]);

         if (res[i + j])
            tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
         if (carry)
            tmp = nir_iadd(b, tmp, carry);
         res[i + j] = nir_u2u32(b, tmp);
         carry = nir_ushr(b, tmp, nir_imm_int(b, 32));
      }
      res[i + 4] = nir_u2u32(b, carry);
   }

   return nir_pack_64_2x32_split(b, res[2], res[3]);
}
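
/* In lower_mul_high64, res[0] and res[1] are bits [0, 63] of the full
 * 128-bit product and are thrown away; res[2] and res[3] are bits
 * [64, 127], exactly the [iu]mul_high result.
 */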

static nir_ssa_def *
lower_isign64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi));
   nir_ssa_def *res_hi = nir_ishr(b, x_hi, nir_imm_int(b, 31));
   nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
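
/* lower_isign64 works because res_hi = x_hi >> 31 is 0 for non-negative x
 * and all-ones (-1) for negative x; oring the "is non-zero" bit into the
 * low dword then yields exactly -1, 0, or 1.  E.g. x = 5: res_hi = 0,
 * res_lo = 0 | 1 = 1.
 */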

static void
lower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d,
                   nir_ssa_def **q, nir_ssa_def **r)
{
   /* TODO: We should specially handle the case where the denominator is a
    * constant.  In that case, we should be able to reduce it to a multiply by
    * a constant, some shifts, and an add.
    */
   nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n);
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *q_lo = nir_imm_zero(b, n->num_components, 32);
   nir_ssa_def *q_hi = nir_imm_zero(b, n->num_components, 32);

   nir_ssa_def *n_hi_before_if = n_hi;
   nir_ssa_def *q_hi_before_if = q_hi;

   /* If the upper 32 bits of denom are non-zero, it is impossible for shifts
    * greater than 32 bits to occur.  If the upper 32 bits of the numerator
    * are zero, it is impossible for (denom << [63, 32]) <= numer unless
    * denom == 0.
    */
   nir_ssa_def *need_high_div =
      nir_iand(b, nir_ieq(b, d_hi, nir_imm_int(b, 0)), nir_uge(b, n_hi, d_lo));
   nir_push_if(b, nir_bany(b, need_high_div));
   {
      /* If we only have one component, then the bany above goes away and
       * this is always true within the if statement.
       */
      if (n->num_components == 1)
         need_high_div = nir_imm_true(b);

      nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo);

      for (int i = 31; i >= 0; i--) {
         /* if ((d.x << i) <= n.y) {
          *    n.y -= d.x << i;
          *    quot.y |= 1U << i;
          * }
          */
         nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i));
         nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift);
         nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i));
         nir_ssa_def *cond = nir_iand(b, need_high_div,
                                      nir_uge(b, n_hi, d_shift));
         if (i != 0) {
            /* log2_d_lo is always <= 31, so we don't need to bother with it
             * in the last iteration.
             */
            cond = nir_iand(b, cond,
                            nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo));
         }
         n_hi = nir_bcsel(b, cond, new_n_hi, n_hi);
         q_hi = nir_bcsel(b, cond, new_q_hi, q_hi);
      }
   }
   nir_pop_if(b, NULL);
   n_hi = nir_if_phi(b, n_hi, n_hi_before_if);
   q_hi = nir_if_phi(b, q_hi, q_hi_before_if);

   nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi);

   n = nir_pack_64_2x32_split(b, n_lo, n_hi);
   d = nir_pack_64_2x32_split(b, d_lo, d_hi);
   for (int i = 31; i >= 0; i--) {
      /* if ((d64 << i) <= n64) {
       *    n64 -= d64 << i;
       *    quot.x |= 1U << i;
       * }
       */
      nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i));
      nir_ssa_def *new_n = nir_isub(b, n, d_shift);
      nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i));
      nir_ssa_def *cond = nir_uge(b, n, d_shift);
      if (i != 0) {
         /* log2_denom is always <= 31, so we don't need to bother with it
          * in the last iteration.
          */
         cond = nir_iand(b, cond,
                         nir_ige(b, nir_imm_int(b, 31 - i), log2_denom));
      }
      n = nir_bcsel(b, cond, new_n, n);
      q_lo = nir_bcsel(b, cond, new_q_lo, q_lo);
   }

   *q = nir_pack_64_2x32_split(b, q_lo, q_hi);
   *r = n;
}
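
/* lower_udiv64_mod64 is classic restoring shift-and-subtract long division:
 * each of the up to 64 quotient bits costs a shift, a compare, and two
 * bcsels, so the expansion is large but, apart from the need_high_div
 * guard, entirely branch-free.
 */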

static nir_ssa_def *
lower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return q;
}

static nir_ssa_def *
lower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)),
                                 nir_ilt(b, d_hi, nir_imm_int(b, 0)));
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, negate, nir_ineg(b, q), q);
}

static nir_ssa_def *
lower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return r;
}

static nir_ssa_def *
lower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
   nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);

   nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);

   return nir_bcsel(b, nir_ieq(b, r, nir_imm_int64(b, 0)), nir_imm_int64(b, 0),
                    nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
                              nir_iadd(b, rem, d)));
}

static nir_ssa_def *
lower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
}

static nir_ssa_def *
lower_extract(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *c)
{
   assert(op == nir_op_extract_u8 || op == nir_op_extract_i8 ||
          op == nir_op_extract_u16 || op == nir_op_extract_i16);

   const int chunk = nir_src_as_uint(nir_src_for_ssa(c));
   const int chunk_bits =
      (op == nir_op_extract_u8 || op == nir_op_extract_i8) ? 8 : 16;
   const int num_chunks_in_32 = 32 / chunk_bits;

   nir_ssa_def *extract32;
   if (chunk < num_chunks_in_32) {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_x(b, x),
                                nir_imm_int(b, chunk),
                                NULL, NULL);
   } else {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_y(b, x),
                                nir_imm_int(b, chunk - num_chunks_in_32),
                                NULL, NULL);
   }

   if (op == nir_op_extract_i8 || op == nir_op_extract_i16)
      return lower_i2i64(b, extract32);
   else
      return lower_u2u64(b, extract32);
}

static nir_ssa_def *
lower_ufind_msb64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *lo_count = nir_ufind_msb(b, x_lo);
   nir_ssa_def *hi_count = nir_ufind_msb(b, x_hi);
   nir_ssa_def *valid_hi_bits = nir_ine(b, x_hi, nir_imm_int(b, 0));
   nir_ssa_def *hi_res = nir_iadd(b, nir_imm_intN_t(b, 32, 32), hi_count);
   return nir_bcsel(b, valid_hi_bits, hi_res, lo_count);
}
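
/* nir_ufind_msb returns -1 for a zero source, so when x == 0 both halves
 * report -1 and the bcsel falls through to lo_count == -1, matching the
 * 32-bit opcode's behavior.
 */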

static nir_ssa_def *
lower_2f(nir_builder *b, nir_ssa_def *x, unsigned dest_bit_size,
         bool src_is_signed)
{
   nir_ssa_def *x_sign = NULL;

   if (src_is_signed) {
      x_sign = nir_bcsel(b, COND_LOWER_CMP(b, ilt, x, nir_imm_int64(b, 0)),
                         nir_imm_floatN_t(b, -1, dest_bit_size),
                         nir_imm_floatN_t(b, 1, dest_bit_size));
      x = COND_LOWER_OP(b, iabs, x);
   }

   nir_ssa_def *exp = COND_LOWER_OP(b, ufind_msb, x);
   unsigned significand_bits;

   switch (dest_bit_size) {
   case 32:
      significand_bits = 23;
      break;
   case 16:
      significand_bits = 10;
      break;
   default:
      unreachable("Invalid dest_bit_size");
   }

   /* We keep one more bit than can fit in the significand field to let the
    * u2f32 conversion do the rounding for us.
    */
   nir_ssa_def *discard =
      nir_imax(b, nir_isub(b, exp, nir_imm_int(b, significand_bits + 1)),
               nir_imm_int(b, 0));

   /* Part of the "round to nearest" has to be taken care of before we discard
    * the LSB, and that's what this extra iadd is for.
    * "Round to nearest even" is handled by u2f. That works because the
    * shifted value either fits in the significand field (which means no
    * rounding is required) or contains one extra bit that forces the
    * conversion op to round things properly.
    */
   nir_ssa_def *add = COND_LOWER_OP(b, ishl, nir_imm_int64(b, 1), discard);
   add = COND_LOWER_OP(b, isub, add, nir_imm_int64(b, 1));
   nir_ssa_def *rounded_x = COND_LOWER_OP(b, iadd, x, add);

   /* Signed Values can't overflow because we've saved the sign and promoted
    * them to unsigned values.
    */
   if (!src_is_signed) {
      nir_ssa_def *overflow = COND_LOWER_CMP(b, ult, rounded_x, x);
      rounded_x = COND_LOWER_OP(b, bcsel, overflow,
                                nir_imm_int64(b, UINT64_MAX), rounded_x);
   }

   nir_ssa_def *significand = COND_LOWER_OP(b, ushr, rounded_x, discard);
   significand = COND_LOWER_CAST(b, u2u32, significand);

   nir_ssa_def *res;
   if (dest_bit_size == 32)
      res = nir_fmul(b, nir_u2f32(b, significand),
                     nir_fexp2(b, nir_u2f32(b, discard)));
   else
      res = nir_fmul(b, nir_u2f16(b, significand),
                     nir_fexp2(b, nir_u2f16(b, discard)));

   if (x_sign)
      res = nir_fmul(b, res, x_sign);

   return res;
}
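
/* Worked example for lower_2f with dest_bit_size == 32 (23 stored
 * significand bits): a value whose MSB is bit 40 gets
 * discard = max(40 - 24, 0) = 16, so 25 bits survive the shift, one more
 * than a float32 significand holds exactly, which is what lets u2f32 itself
 * perform the final round-to-nearest-even step.
 */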

static nir_ssa_def *
lower_f2(nir_builder *b, nir_ssa_def *x, bool dst_is_signed)
{
   assert(x->bit_size == 16 || x->bit_size == 32);
   nir_ssa_def *x_sign = NULL;

   if (dst_is_signed)
      x_sign = nir_fsign(b, x);
   else
      x = nir_fmin(b, x, nir_imm_floatN_t(b, UINT64_MAX, x->bit_size));

   x = nir_ftrunc(b, x);

   if (dst_is_signed) {
      x = nir_fmin(b, x, nir_imm_floatN_t(b, INT64_MAX, x->bit_size));
      x = nir_fmax(b, x, nir_imm_floatN_t(b, INT64_MIN, x->bit_size));
      x = nir_fabs(b, x);
   }

   nir_ssa_def *div = nir_imm_floatN_t(b, 1ULL << 32, x->bit_size);
   nir_ssa_def *res_hi = nir_f2u32(b, nir_fdiv(b, x, div));
   nir_ssa_def *res_lo = nir_f2u32(b, nir_frem(b, x, div));
   nir_ssa_def *res = nir_pack_64_2x32_split(b, res_lo, res_hi);

   if (dst_is_signed)
      res = nir_bcsel(b, nir_flt(b, x_sign, nir_imm_float(b, 0)),
                      nir_ineg(b, res), res);

   return res;
}
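
/* lower_f2 splits the float with the inverse trick: x / 2^32 yields the
 * high dword and frem(x, 2^32) the low dword, both in [0, 2^32) after the
 * earlier truncation and clamping, so each f2u32 is exact for the integer
 * values that survive.
 */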

nir_lower_int64_options
nir_lower_int64_op_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_imul:
   case nir_op_amul:
      return nir_lower_imul64;
   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      return nir_lower_imul_2x32_64;
   case nir_op_imul_high:
   case nir_op_umul_high:
      return nir_lower_imul_high64;
   case nir_op_isign:
      return nir_lower_isign64;
   case nir_op_udiv:
   case nir_op_idiv:
   case nir_op_umod:
   case nir_op_imod:
   case nir_op_irem:
      return nir_lower_divmod64;
   case nir_op_b2i64:
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_i2i64:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_u2u64:
   case nir_op_i2f64:
   case nir_op_u2f64:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_bcsel:
      return nir_lower_mov64;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return nir_lower_icmp64;
   case nir_op_iadd:
   case nir_op_isub:
      return nir_lower_iadd64;
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
   case nir_op_imin3:
   case nir_op_imax3:
   case nir_op_umin3:
   case nir_op_umax3:
   case nir_op_imed3:
   case nir_op_umed3:
      return nir_lower_minmax64;
   case nir_op_iabs:
      return nir_lower_iabs64;
   case nir_op_ineg:
      return nir_lower_ineg64;
   case nir_op_iand:
   case nir_op_ior:
   case nir_op_ixor:
   case nir_op_inot:
      return nir_lower_logic64;
   case nir_op_ishl:
   case nir_op_ishr:
   case nir_op_ushr:
      return nir_lower_shift64;
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return nir_lower_extract64;
   case nir_op_ufind_msb:
      return nir_lower_ufind_msb64;
   default:
      return 0;
   }
}

static nir_ssa_def *
lower_int64_alu_instr(nir_builder *b, nir_instr *instr, void *_state)
{
   nir_alu_instr *alu = nir_instr_as_alu(instr);

   nir_ssa_def *src[4];
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
      src[i] = nir_ssa_for_alu_src(b, alu, i);

   switch (alu->op) {
   case nir_op_imul:
   case nir_op_amul:
      return lower_imul64(b, src[0], src[1]);
   case nir_op_imul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], true);
   case nir_op_umul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], false);
   case nir_op_imul_high:
      return lower_mul_high64(b, src[0], src[1], true);
   case nir_op_umul_high:
      return lower_mul_high64(b, src[0], src[1], false);
   case nir_op_isign:
      return lower_isign64(b, src[0]);
   case nir_op_udiv:
      return lower_udiv64(b, src[0], src[1]);
   case nir_op_idiv:
      return lower_idiv64(b, src[0], src[1]);
   case nir_op_umod:
      return lower_umod64(b, src[0], src[1]);
   case nir_op_imod:
      return lower_imod64(b, src[0], src[1]);
   case nir_op_irem:
      return lower_irem64(b, src[0], src[1]);
   case nir_op_b2i64:
      return lower_b2i64(b, src[0]);
   case nir_op_i2b1:
      return lower_i2b(b, src[0]);
   case nir_op_i2i8:
      return lower_i2i8(b, src[0]);
   case nir_op_i2i16:
      return lower_i2i16(b, src[0]);
   case nir_op_i2i32:
      return lower_i2i32(b, src[0]);
   case nir_op_i2i64:
      return lower_i2i64(b, src[0]);
   case nir_op_u2u8:
      return lower_u2u8(b, src[0]);
   case nir_op_u2u16:
      return lower_u2u16(b, src[0]);
   case nir_op_u2u32:
      return lower_u2u32(b, src[0]);
   case nir_op_u2u64:
      return lower_u2u64(b, src[0]);
   case nir_op_bcsel:
      return lower_bcsel64(b, src[0], src[1], src[2]);
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return lower_int64_compare(b, alu->op, src[0], src[1]);
   case nir_op_iadd:
      return lower_iadd64(b, src[0], src[1]);
   case nir_op_isub:
      return lower_isub64(b, src[0], src[1]);
   case nir_op_imin:
      return lower_imin64(b, src[0], src[1]);
   case nir_op_imax:
      return lower_imax64(b, src[0], src[1]);
   case nir_op_umin:
      return lower_umin64(b, src[0], src[1]);
   case nir_op_umax:
      return lower_umax64(b, src[0], src[1]);
   case nir_op_imin3:
      return lower_imin64(b, src[0], lower_imin64(b, src[1], src[2]));
   case nir_op_imax3:
      return lower_imax64(b, src[0], lower_imax64(b, src[1], src[2]));
   case nir_op_umin3:
      return lower_umin64(b, src[0], lower_umin64(b, src[1], src[2]));
   case nir_op_umax3:
      return lower_umax64(b, src[0], lower_umax64(b, src[1], src[2]));
   case nir_op_imed3:
      return lower_imax64(b,
                          lower_imin64(b, lower_imax64(b, src[0], src[1]),
                                       src[2]),
                          lower_imin64(b, src[0], src[1]));
   case nir_op_umed3:
      return lower_umax64(b,
                          lower_umin64(b, lower_umax64(b, src[0], src[1]),
                                       src[2]),
                          lower_umin64(b, src[0], src[1]));
   case nir_op_iabs:
      return lower_iabs64(b, src[0]);
   case nir_op_ineg:
      return lower_ineg64(b, src[0]);
   case nir_op_iand:
      return lower_iand64(b, src[0], src[1]);
   case nir_op_ior:
      return lower_ior64(b, src[0], src[1]);
   case nir_op_ixor:
      return lower_ixor64(b, src[0], src[1]);
   case nir_op_inot:
      return lower_inot64(b, src[0]);
   case nir_op_ishl:
      return lower_ishl64(b, src[0], src[1]);
   case nir_op_ishr:
      return lower_ishr64(b, src[0], src[1]);
   case nir_op_ushr:
      return lower_ushr64(b, src[0], src[1]);
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return lower_extract(b, alu->op, src[0], src[1]);
   case nir_op_ufind_msb:
      return lower_ufind_msb64(b, src[0]);
   case nir_op_i2f64:
   case nir_op_i2f32:
   case nir_op_i2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), true);
   case nir_op_u2f64:
   case nir_op_u2f32:
   case nir_op_u2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), false);
   case nir_op_f2i64:
   case nir_op_f2u64:
      /* We don't support f64toi64 (yet?). */
      if (src[0]->bit_size > 32)
         return NULL;

      return lower_f2(b, src[0], alu->op == nir_op_f2i64);
   default:
      unreachable("Invalid ALU opcode to lower");
   }
}

static bool
should_lower_int64_alu_instr(const nir_instr *instr, const void *_data)
{
   const nir_shader_compiler_options *options =
      (const nir_shader_compiler_options *)_data;

   if (instr->type != nir_instr_type_alu)
      return false;

   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_bcsel:
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[2].src.is_ssa);
      assert(alu->src[1].src.ssa->bit_size ==
             alu->src[2].src.ssa->bit_size);
      if (alu->src[1].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      assert(alu->src[0].src.is_ssa);
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[0].src.ssa->bit_size ==
             alu->src[1].src.ssa->bit_size);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_ufind_msb:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_amul:
      assert(alu->dest.dest.is_ssa);
      if (options->has_imul24)
         return false;
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   case nir_op_i2f64:
   case nir_op_u2f64:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   default:
      assert(alu->dest.dest.is_ssa);
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   }

   unsigned mask = nir_lower_int64_op_to_options_mask(alu->op);
   return (options->lower_int64_options & mask) != 0;
}

bool
nir_lower_int64(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        should_lower_int64_alu_instr,
                                        lower_int64_alu_instr,
                                        (void *)shader->options);
}