/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

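/* Lowering one 64-bit operation often requires building others (e.g. the
 * int-to-float path below uses 64-bit iabs, ushr, and compares).  These
 * helpers emit the lowered sequence directly when the backend's options also
 * request lowering of the nested op, and the native NIR opcode otherwise.
 */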
#define COND_LOWER_OP(b, name, ...) \
   (b->shader->options->lower_int64_options & \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \
   lower_##name##64(b, __VA_ARGS__) : nir_##name(b, __VA_ARGS__)

#define COND_LOWER_CMP(b, name, ...) \
   (b->shader->options->lower_int64_options & \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \
   lower_int64_compare(b, nir_op_##name, __VA_ARGS__) : \
   nir_##name(b, __VA_ARGS__)

#define COND_LOWER_CAST(b, name, ...) \
   (b->shader->options->lower_int64_options & \
    nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \
   lower_##name(b, __VA_ARGS__) : \
   nir_##name(b, __VA_ARGS__)

static nir_ssa_def *
lower_b2i64(nir_builder *b, nir_ssa_def *x)
{
   return nir_pack_64_2x32_split(b, nir_b2i32(b, x), nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2b(nir_builder *b, nir_ssa_def *x)
{
   return nir_ine(b, nir_ior(b, nir_unpack_64_2x32_split_x(b, x),
                                nir_unpack_64_2x32_split_y(b, x)),
                     nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2i8(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i16(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_i2i64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_i2i32(b, x);
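   /* Sign-extend into the high dword: an arithmetic right shift of the low
    * dword by 31 replicates its sign bit across all 32 bits.
    */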
   return nir_pack_64_2x32_split(b, x32, nir_ishr_imm(b, x32, 31));
}

static nir_ssa_def *
lower_u2u8(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u16(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_u2u64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_u2u32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_bcsel64(nir_builder *b, nir_ssa_def *cond, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_bcsel(b, cond, x_lo, y_lo),
                                    nir_bcsel(b, cond, x_hi, y_hi));
}

static nir_ssa_def *
lower_inot64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   return nir_pack_64_2x32_split(b, nir_inot(b, x_lo), nir_inot(b, x_hi));
}

static nir_ssa_def *
lower_iand64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_iand(b, x_lo, y_lo),
                                    nir_iand(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ior64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ior(b, x_lo, y_lo),
                                    nir_ior(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ixor(b, x_lo, y_lo),
                                    nir_ixor(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t lshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo << c;
    *       uint32_t hi_shifted = hi << c;
    *       uint32_t lo_shifted_hi = lo >> abs(32 - c);
    *       return pack_64(lo_shifted, hi_shifted | lo_shifted_hi);
    *    } else {
    *       uint32_t lo_shifted_hi = lo << abs(32 - c);
    *       return pack_64(0, lo_shifted_hi);
    *    }
    * }
    */
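   /* For example, with c = 40 the first branch is dead and the result is
    * pack_64(0, lo << abs(32 - 40)) = pack_64(0, lo << 8): the low dword
    * moves into the high dword, shifted by the remaining 8 bits.  The same
    * abs(32 - c) count serves both branches, which is why a single
    * reverse_count value below feeds both candidate results.
    */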
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y);
   nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, lo_shifted,
                                nir_ior(b, hi_shifted, lo_shifted_hi));
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_imm_int(b, 0),
                                nir_ishl(b, x_lo, reverse_count));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_ishr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t arshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x);
    *    int32_t  hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted = hi >> 31;
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                                hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count),
                                nir_ishr(b, x_hi, nir_imm_int(b, 31)));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t rshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(0, hi_shifted_lo);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                                hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count),
                                nir_imm_int(b, 0));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_iadd(b, x_lo, y_lo);
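   /* A 32-bit unsigned add wrapped around exactly when the result is less
    * than either operand, so comparing res_lo with x_lo recovers the carry
    * into the high dword.
    */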
   nir_ssa_def *carry = nir_b2i32(b, nir_ult(b, res_lo, x_lo));
   nir_ssa_def *res_hi = nir_iadd(b, carry, nir_iadd(b, x_hi, y_hi));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static nir_ssa_def *
lower_isub64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_isub(b, x_lo, y_lo);
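   /* The low-dword subtraction borrowed exactly when x_lo < y_lo; in that
    * case subtract one more from the high dword.
    */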
   nir_ssa_def *borrow = nir_ineg(b, nir_b2i32(b, nir_ult(b, x_lo, y_lo)));
   nir_ssa_def *res_hi = nir_iadd(b, nir_isub(b, x_hi, y_hi), borrow);

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static nir_ssa_def *
lower_ineg64(nir_builder *b, nir_ssa_def *x)
{
   /* Since isub is the same number of instructions (with better dependencies)
    * as iadd, subtraction is actually more efficient for ineg than the usual
    * 2's complement "flip the bits and add one".
    */
   return lower_isub64(b, nir_imm_int64(b, 0), x);
}

static nir_ssa_def *
lower_iabs64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *x_is_neg = nir_ilt(b, x_hi, nir_imm_int(b, 0));
   return nir_bcsel(b, x_is_neg, nir_ineg(b, x), x);
}

static nir_ssa_def *
lower_int64_compare(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
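
   /* All six comparisons reduce to 32-bit ones: equality checks both dwords,
    * while the ordered comparisons decide on the high dwords and only fall
    * back to an unsigned low-dword comparison when the high dwords are
    * equal.  Signedness only matters for the high dword.
    */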
   switch (op) {
   case nir_op_ieq:
      return nir_iand(b, nir_ieq(b, x_hi, y_hi), nir_ieq(b, x_lo, y_lo));
   case nir_op_ine:
      return nir_ior(b, nir_ine(b, x_hi, y_hi), nir_ine(b, x_lo, y_lo));
   case nir_op_ult:
      return nir_ior(b, nir_ult(b, x_hi, y_hi),
                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
                                    nir_ult(b, x_lo, y_lo)));
   case nir_op_ilt:
      return nir_ior(b, nir_ilt(b, x_hi, y_hi),
                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
                                    nir_ult(b, x_lo, y_lo)));
   case nir_op_uge:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ult, x, y));
   case nir_op_ige:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ilt, x, y));
   default:
      unreachable("Invalid comparison");
   }
}

static nir_ssa_def *
lower_umax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), y, x);
}

static nir_ssa_def *
lower_imax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), y, x);
}

static nir_ssa_def *
lower_umin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), x, y);
}

static nir_ssa_def *
lower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
}

static nir_ssa_def *
lower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                  bool sign_extend)
{
   nir_ssa_def *res_hi = sign_extend ? nir_imul_high(b, x, y)
                                     : nir_umul_high(b, x, y);

   return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi);
}

static nir_ssa_def *
lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
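
   /* Expanding (x_lo + 2^32 x_hi) * (y_lo + 2^32 y_hi) mod 2^64: the
    * x_hi * y_hi term vanishes entirely and the two cross terms only
    * contribute to the high dword, so one full 32x32->64 multiply plus two
    * 32x32->32 multiplies suffice.
    */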
   nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo);
   nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo),
                                  nir_iadd(b, nir_imul(b, x_lo, y_hi),
                                              nir_imul(b, x_hi, y_lo)));

   return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo),
                                    res_hi);
}

static nir_ssa_def *
lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                 bool sign_extend)
{
   nir_ssa_def *x32[4], *y32[4];
   x32[0] = nir_unpack_64_2x32_split_x(b, x);
   x32[1] = nir_unpack_64_2x32_split_y(b, x);
   if (sign_extend) {
      x32[2] = x32[3] = nir_ishr_imm(b, x32[1], 31);
   } else {
      x32[2] = x32[3] = nir_imm_int(b, 0);
   }

   y32[0] = nir_unpack_64_2x32_split_x(b, y);
   y32[1] = nir_unpack_64_2x32_split_y(b, y);
   if (sign_extend) {
      y32[2] = y32[3] = nir_ishr_imm(b, y32[1], 31);
   } else {
      y32[2] = y32[3] = nir_imm_int(b, 0);
   }

   nir_ssa_def *res[8] = { NULL, };

   /* Yes, the following generates a pile of code.  However, we throw res[0]
    * and res[1] away in the end and, if we're in the umul case, four of our
    * eight dword operands will be constant zero and opt_algebraic will clean
    * this up.
    */
   for (unsigned i = 0; i < 4; i++) {
      nir_ssa_def *carry = NULL;
      for (unsigned j = 0; j < 4; j++) {
         /* The maximum values of x32[i] and y32[j] are UINT32_MAX so the
          * maximum value of tmp is UINT32_MAX * UINT32_MAX.  The maximum
          * value that will fit in tmp is
          *
          *    UINT64_MAX = UINT32_MAX << 32 + UINT32_MAX
          *               = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
          *               = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
          *
          * so we're guaranteed that we can add in two more 32-bit values
          * without overflowing tmp.
          */
         nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[j]);

         if (res[i + j])
            tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
         if (carry)
            tmp = nir_iadd(b, tmp, carry);
         res[i + j] = nir_u2u32(b, tmp);
         carry = nir_ushr_imm(b, tmp, 32);
      }
      res[i + 4] = nir_u2u32(b, carry);
   }
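
   /* res[0..7] now hold the full product in 32-bit limbs, least significant
    * first; limbs 2 and 3 are bits [64, 127], i.e. the high half we want.
    */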
   return nir_pack_64_2x32_split(b, res[2], res[3]);
}

static nir_ssa_def *
lower_isign64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi));
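   /* res_hi is the sign bit replicated: 0 for non-negative x and ~0 (-1)
    * otherwise.  OR-ing its low dword with 1 when x is non-zero then yields
    * exactly -1, 0, or +1 across the packed result.
    */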
   nir_ssa_def *res_hi = nir_ishr_imm(b, x_hi, 31);
   nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static void
lower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d,
                   nir_ssa_def **q, nir_ssa_def **r)
{
   /* TODO: We should specially handle the case where the denominator is a
    * constant.  In that case, we should be able to reduce it to a multiply by
    * a constant, some shifts, and an add.
    */
   nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n);
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *q_lo = nir_imm_zero(b, n->num_components, 32);
   nir_ssa_def *q_hi = nir_imm_zero(b, n->num_components, 32);

   nir_ssa_def *n_hi_before_if = n_hi;
   nir_ssa_def *q_hi_before_if = q_hi;

   /* If the upper 32 bits of denom are non-zero, it is impossible for shifts
    * greater than 32 bits to occur.  If the upper 32 bits of the numerator
    * are zero, it is impossible for (denom << [63, 32]) <= numer unless
    * denom == 0.
    */
   nir_ssa_def *need_high_div =
      nir_iand(b, nir_ieq(b, d_hi, nir_imm_int(b, 0)), nir_uge(b, n_hi, d_lo));
   nir_push_if(b, nir_bany(b, need_high_div));
   {
      /* If we only have one component, then the bany above goes away and
       * this is always true within the if statement.
       */
      if (n->num_components == 1)
         need_high_div = nir_imm_true(b);

      nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo);
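
      /* Classic restoring shift-and-subtract division for the high dword of
       * the quotient: propose each quotient bit from high to low, subtract
       * d_lo << i whenever it still fits in the running remainder, and use
       * log2_d_lo to reject shift amounts where d_lo << i would overflow
       * 32 bits.
       */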
      for (int i = 31; i >= 0; i--) {
         /* if ((d.x << i) <= n.y) {
          *    n.y -= d.x << i;
          *    quot.y |= 1U << i;
          * }
          */
         nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i));
         nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift);
         nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i));
         nir_ssa_def *cond = nir_iand(b, need_high_div,
                                         nir_uge(b, n_hi, d_shift));
         if (i != 0) {
            /* log2_d_lo is always <= 31, so we don't need to bother with it
             * in the last iteration.
             */
            cond = nir_iand(b, cond,
                               nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo));
         }
         n_hi = nir_bcsel(b, cond, new_n_hi, n_hi);
         q_hi = nir_bcsel(b, cond, new_q_hi, q_hi);
      }
   }
   nir_pop_if(b, NULL);
   n_hi = nir_if_phi(b, n_hi, n_hi_before_if);
   q_hi = nir_if_phi(b, q_hi, q_hi_before_if);

   nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi);

   n = nir_pack_64_2x32_split(b, n_lo, n_hi);
   d = nir_pack_64_2x32_split(b, d_lo, d_hi);
   for (int i = 31; i >= 0; i--) {
      /* if ((d64 << i) <= n64) {
       *    n64 -= d64 << i;
       *    quot.x |= 1U << i;
       * }
       */
      nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i));
      nir_ssa_def *new_n = nir_isub(b, n, d_shift);
      nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i));
      nir_ssa_def *cond = nir_uge(b, n, d_shift);
      if (i != 0) {
         /* log2_denom is always <= 31, so we don't need to bother with it
          * in the last iteration.
          */
         cond = nir_iand(b, cond,
                            nir_ige(b, nir_imm_int(b, 31 - i), log2_denom));
      }
      n = nir_bcsel(b, cond, new_n, n);
      q_lo = nir_bcsel(b, cond, new_q_lo, q_lo);
   }

   *q = nir_pack_64_2x32_split(b, q_lo, q_hi);
   *r = n;
}

static nir_ssa_def *
lower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return q;
}

static nir_ssa_def *
lower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)),
                                    nir_ilt(b, d_hi, nir_imm_int(b, 0)));
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, negate, nir_ineg(b, q), q);
}

static nir_ssa_def *
lower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return r;
}

static nir_ssa_def *
lower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
   nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);

   nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);

   return nir_bcsel(b, nir_ieq(b, r, nir_imm_int64(b, 0)), nir_imm_int64(b, 0),
                    nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
                              nir_iadd(b, rem, d)));
}

static nir_ssa_def *
lower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
}

static nir_ssa_def *
lower_extract(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *c)
{
   assert(op == nir_op_extract_u8 || op == nir_op_extract_i8 ||
          op == nir_op_extract_u16 || op == nir_op_extract_i16);

   const int chunk = nir_src_as_uint(nir_src_for_ssa(c));
   const int chunk_bits =
      (op == nir_op_extract_u8 || op == nir_op_extract_i8) ? 8 : 16;
   const int num_chunks_in_32 = 32 / chunk_bits;
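
   /* Chunk indices below num_chunks_in_32 select from the low dword; larger
    * indices select from the high dword after rebasing the index.
    */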
   nir_ssa_def *extract32;
   if (chunk < num_chunks_in_32) {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_x(b, x),
                                nir_imm_int(b, chunk),
                                NULL, NULL);
   } else {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_y(b, x),
                                nir_imm_int(b, chunk - num_chunks_in_32),
                                NULL, NULL);
   }

   if (op == nir_op_extract_i8 || op == nir_op_extract_i16)
      return lower_i2i64(b, extract32);
   else
      return lower_u2u64(b, extract32);
}

static nir_ssa_def *
lower_ufind_msb64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *lo_count = nir_ufind_msb(b, x_lo);
   nir_ssa_def *hi_count = nir_ufind_msb(b, x_hi);
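   /* ufind_msb returns -1 for a zero input, so when the high dword is zero
    * the low dword's result (a bit index or -1) is already correct;
    * otherwise the answer is the high dword's bit index rebased by 32.
    */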
   nir_ssa_def *valid_hi_bits = nir_ine(b, x_hi, nir_imm_int(b, 0));
   nir_ssa_def *hi_res = nir_iadd(b, nir_imm_intN_t(b, 32, 32), hi_count);
   return nir_bcsel(b, valid_hi_bits, hi_res, lo_count);
}

static nir_ssa_def *
lower_2f(nir_builder *b, nir_ssa_def *x, unsigned dest_bit_size,
         bool src_is_signed)
{
   nir_ssa_def *x_sign = NULL;

   if (src_is_signed) {
      x_sign = nir_bcsel(b, COND_LOWER_CMP(b, ilt, x, nir_imm_int64(b, 0)),
                         nir_imm_floatN_t(b, -1, dest_bit_size),
                         nir_imm_floatN_t(b, 1, dest_bit_size));
      x = COND_LOWER_OP(b, iabs, x);
   }

   nir_ssa_def *exp = COND_LOWER_OP(b, ufind_msb, x);
   unsigned significand_bits;

   switch (dest_bit_size) {
   case 32:
      significand_bits = 23;
      break;
   case 16:
      significand_bits = 10;
      break;
   default:
      unreachable("Invalid dest_bit_size");
   }

   nir_ssa_def *discard =
      nir_imax(b, nir_isub(b, exp, nir_imm_int(b, significand_bits)),
                  nir_imm_int(b, 0));
   nir_ssa_def *significand =
      COND_LOWER_CAST(b, u2u32, COND_LOWER_OP(b, ushr, x, discard));

   /* Round-to-nearest-even implementation:
    * - if the non-representable part of the significand is higher than half
    *   the minimum representable significand, we round-up
    * - if the non-representable part of the significand is equal to half the
    *   minimum representable significand and the representable part of the
    *   significand is odd, we round-up
    * - in any other case, we round-down
    */
   nir_ssa_def *lsb_mask = COND_LOWER_OP(b, ishl, nir_imm_int64(b, 1), discard);
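   /* lsb_mask has a single bit set at the least significant *kept* position,
    * so rem_mask below selects exactly the discarded bits and half is the
    * midpoint of one representable step.
    */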
   nir_ssa_def *rem_mask = COND_LOWER_OP(b, isub, lsb_mask, nir_imm_int64(b, 1));
   nir_ssa_def *half = COND_LOWER_OP(b, ishr, lsb_mask, nir_imm_int(b, 1));
   nir_ssa_def *rem = COND_LOWER_OP(b, iand, x, rem_mask);
   nir_ssa_def *halfway = nir_iand(b, COND_LOWER_CMP(b, ieq, rem, half),
                                   nir_ine(b, discard, nir_imm_int(b, 0)));
   nir_ssa_def *is_odd = nir_i2b(b, nir_iand(b, significand, nir_imm_int(b, 1)));
   nir_ssa_def *round_up = nir_ior(b, COND_LOWER_CMP(b, ilt, half, rem),
                                   nir_iand(b, halfway, is_odd));
   significand = nir_iadd(b, significand, nir_b2i32(b, round_up));

   nir_ssa_def *res;

   if (dest_bit_size == 32)
      res = nir_fmul(b, nir_u2f32(b, significand),
                     nir_fexp2(b, nir_u2f32(b, discard)));
   else
      res = nir_fmul(b, nir_u2f16(b, significand),
                     nir_fexp2(b, nir_u2f16(b, discard)));

   if (src_is_signed)
      res = nir_fmul(b, res, x_sign);

   return res;
}

static nir_ssa_def *
lower_f2(nir_builder *b, nir_ssa_def *x, bool dst_is_signed)
{
   assert(x->bit_size == 16 || x->bit_size == 32);
   nir_ssa_def *x_sign = NULL;

   if (dst_is_signed)
      x_sign = nir_fsign(b, x);
   else
      x = nir_fmin(b, x, nir_imm_floatN_t(b, UINT64_MAX, x->bit_size));

   x = nir_ftrunc(b, x);

   if (dst_is_signed) {
      x = nir_fmin(b, x, nir_imm_floatN_t(b, INT64_MAX, x->bit_size));
      x = nir_fmax(b, x, nir_imm_floatN_t(b, INT64_MIN, x->bit_size));
      x = nir_fabs(b, x);
   }
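
   /* Split the (now non-negative) float into two dwords: dividing by 2^32
    * yields the high dword and the remainder yields the low dword, each of
    * which fits in a 32-bit unsigned conversion.
    */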
   nir_ssa_def *div = nir_imm_floatN_t(b, 1ULL << 32, x->bit_size);
   nir_ssa_def *res_hi = nir_f2u32(b, nir_fdiv(b, x, div));
   nir_ssa_def *res_lo = nir_f2u32(b, nir_frem(b, x, div));
   nir_ssa_def *res = nir_pack_64_2x32_split(b, res_lo, res_hi);

   if (dst_is_signed)
      res = nir_bcsel(b, nir_flt(b, x_sign, nir_imm_float(b, 0)),
                      nir_ineg(b, res), res);

   return res;
}

static nir_ssa_def *
lower_bit_count64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *lo_count = nir_bit_count(b, x_lo);
   nir_ssa_def *hi_count = nir_bit_count(b, x_hi);
   return nir_iadd(b, lo_count, hi_count);
}

nir_lower_int64_options
nir_lower_int64_op_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_imul:
   case nir_op_amul:
      return nir_lower_imul64;
   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      return nir_lower_imul_2x32_64;
   case nir_op_imul_high:
   case nir_op_umul_high:
      return nir_lower_imul_high64;
   case nir_op_isign:
      return nir_lower_isign64;
   case nir_op_udiv:
   case nir_op_idiv:
   case nir_op_umod:
   case nir_op_imod:
   case nir_op_irem:
      return nir_lower_divmod64;
   case nir_op_b2i64:
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_i2i64:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_u2u64:
   case nir_op_bcsel:
      return nir_lower_mov64;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return nir_lower_icmp64;
   case nir_op_iadd:
   case nir_op_isub:
      return nir_lower_iadd64;
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
      return nir_lower_minmax64;
   case nir_op_iabs:
      return nir_lower_iabs64;
   case nir_op_ineg:
      return nir_lower_ineg64;
   case nir_op_iand:
   case nir_op_ior:
   case nir_op_ixor:
   case nir_op_inot:
      return nir_lower_logic64;
   case nir_op_ishl:
   case nir_op_ishr:
   case nir_op_ushr:
      return nir_lower_shift64;
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return nir_lower_extract64;
   case nir_op_ufind_msb:
      return nir_lower_ufind_msb64;
   case nir_op_bit_count:
      return nir_lower_bit_count64;
   default:
      return 0;
   }
}

static nir_ssa_def *
lower_int64_alu_instr(nir_builder *b, nir_instr *instr, void *_state)
{
   nir_alu_instr *alu = nir_instr_as_alu(instr);

   nir_ssa_def *src[4];
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
      src[i] = nir_ssa_for_alu_src(b, alu, i);

   switch (alu->op) {
   case nir_op_imul:
   case nir_op_amul:
      return lower_imul64(b, src[0], src[1]);
   case nir_op_imul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], true);
   case nir_op_umul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], false);
   case nir_op_imul_high:
      return lower_mul_high64(b, src[0], src[1], true);
   case nir_op_umul_high:
      return lower_mul_high64(b, src[0], src[1], false);
   case nir_op_isign:
      return lower_isign64(b, src[0]);
   case nir_op_udiv:
      return lower_udiv64(b, src[0], src[1]);
   case nir_op_idiv:
      return lower_idiv64(b, src[0], src[1]);
   case nir_op_umod:
      return lower_umod64(b, src[0], src[1]);
   case nir_op_imod:
      return lower_imod64(b, src[0], src[1]);
   case nir_op_irem:
      return lower_irem64(b, src[0], src[1]);
   case nir_op_b2i64:
      return lower_b2i64(b, src[0]);
   case nir_op_i2b1:
      return lower_i2b(b, src[0]);
   case nir_op_i2i8:
      return lower_i2i8(b, src[0]);
   case nir_op_i2i16:
      return lower_i2i16(b, src[0]);
   case nir_op_i2i32:
      return lower_i2i32(b, src[0]);
   case nir_op_i2i64:
      return lower_i2i64(b, src[0]);
   case nir_op_u2u8:
      return lower_u2u8(b, src[0]);
   case nir_op_u2u16:
      return lower_u2u16(b, src[0]);
   case nir_op_u2u32:
      return lower_u2u32(b, src[0]);
   case nir_op_u2u64:
      return lower_u2u64(b, src[0]);
   case nir_op_bcsel:
      return lower_bcsel64(b, src[0], src[1], src[2]);
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return lower_int64_compare(b, alu->op, src[0], src[1]);
   case nir_op_iadd:
      return lower_iadd64(b, src[0], src[1]);
   case nir_op_isub:
      return lower_isub64(b, src[0], src[1]);
   case nir_op_imin:
      return lower_imin64(b, src[0], src[1]);
   case nir_op_imax:
      return lower_imax64(b, src[0], src[1]);
   case nir_op_umin:
      return lower_umin64(b, src[0], src[1]);
   case nir_op_umax:
      return lower_umax64(b, src[0], src[1]);
   case nir_op_iabs:
      return lower_iabs64(b, src[0]);
   case nir_op_ineg:
      return lower_ineg64(b, src[0]);
   case nir_op_iand:
      return lower_iand64(b, src[0], src[1]);
   case nir_op_ior:
      return lower_ior64(b, src[0], src[1]);
   case nir_op_ixor:
      return lower_ixor64(b, src[0], src[1]);
   case nir_op_inot:
      return lower_inot64(b, src[0]);
   case nir_op_ishl:
      return lower_ishl64(b, src[0], src[1]);
   case nir_op_ishr:
      return lower_ishr64(b, src[0], src[1]);
   case nir_op_ushr:
      return lower_ushr64(b, src[0], src[1]);
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return lower_extract(b, alu->op, src[0], src[1]);
   case nir_op_ufind_msb:
      return lower_ufind_msb64(b, src[0]);
   case nir_op_bit_count:
      return lower_bit_count64(b, src[0]);
   case nir_op_i2f32:
   case nir_op_i2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), true);
   case nir_op_u2f32:
   case nir_op_u2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), false);
   case nir_op_f2i64:
   case nir_op_f2u64:
      /* We don't support f64toi64 (yet?). */
      if (src[0]->bit_size > 32)
         return NULL;

      return lower_f2(b, src[0], alu->op == nir_op_f2i64);
   default:
      unreachable("Invalid ALU opcode to lower");
   }
}

static bool
should_lower_int64_alu_instr(const nir_instr *instr, const void *_data)
{
   const nir_shader_compiler_options *options =
      (const nir_shader_compiler_options *)_data;

   if (instr->type != nir_instr_type_alu)
      return false;

   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_bcsel:
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[2].src.is_ssa);
      assert(alu->src[1].src.ssa->bit_size ==
             alu->src[2].src.ssa->bit_size);
      if (alu->src[1].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      assert(alu->src[0].src.is_ssa);
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[0].src.ssa->bit_size ==
             alu->src[1].src.ssa->bit_size);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_ufind_msb:
   case nir_op_bit_count:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_amul:
      assert(alu->dest.dest.is_ssa);
      if (options->has_imul24)
         return false;
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   case nir_op_i2f32:
   case nir_op_i2f16:
   case nir_op_u2f32:
   case nir_op_u2f16:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   default:
      assert(alu->dest.dest.is_ssa);
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   }

   unsigned mask = nir_lower_int64_op_to_options_mask(alu->op);
   return (options->lower_int64_options & mask) != 0;
}

bool
nir_lower_int64(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        should_lower_int64_alu_instr,
                                        lower_int64_alu_instr,
                                        (void *)shader->options);
}