nir: Wire up int64 lowering functions
[mesa.git] / src / compiler / nir / nir_lower_int64.c
/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

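/* This pass lowers 64-bit integer ALU operations into sequences of 32-bit
 * operations.  Each 64-bit value is split into its low and high dwords with
 * nir_unpack_64_2x32_split_x/y, the operation is performed on the dwords,
 * and the result is recombined with nir_pack_64_2x32_split.  Which opcodes
 * actually get lowered is controlled by the nir_lower_int64_options bitmask
 * passed to nir_lower_int64().
 */
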
static nir_ssa_def *
lower_b2i64(nir_builder *b, nir_ssa_def *x)
{
   return nir_pack_64_2x32_split(b, nir_b2i32(b, x), nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2b(nir_builder *b, nir_ssa_def *x)
{
   return nir_ine(b, nir_ior(b, nir_unpack_64_2x32_split_x(b, x),
                                nir_unpack_64_2x32_split_y(b, x)),
                     nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2i8(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i16(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_i2i64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_i2i32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_ishr(b, x32, nir_imm_int(b, 31)));
}

static nir_ssa_def *
lower_u2u8(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u16(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_u2u64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_u2u32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_imm_int(b, 0));
}

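/* bcsel and the bitwise operations simply act on the low and high dwords
 * independently.
 */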
static nir_ssa_def *
lower_bcsel64(nir_builder *b, nir_ssa_def *cond, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_bcsel(b, cond, x_lo, y_lo),
                                    nir_bcsel(b, cond, x_hi, y_hi));
}

static nir_ssa_def *
lower_inot64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   return nir_pack_64_2x32_split(b, nir_inot(b, x_lo), nir_inot(b, x_hi));
}

static nir_ssa_def *
lower_iand64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_iand(b, x_lo, y_lo),
                                    nir_iand(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ior64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ior(b, x_lo, y_lo),
                                    nir_ior(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ixor(b, x_lo, y_lo),
                                    nir_ixor(b, x_hi, y_hi));
}

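/* 64-bit addition: add the low dwords; because that add wraps modulo 2^32,
 * a carry happened exactly when the low result is (unsigned) less than one
 * of its addends, and that carry is folded into the sum of the high dwords.
 */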
static nir_ssa_def *
lower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_iadd(b, x_lo, y_lo);
   nir_ssa_def *carry = nir_b2i32(b, nir_ult(b, res_lo, x_lo));
   nir_ssa_def *res_hi = nir_iadd(b, carry, nir_iadd(b, x_hi, y_hi));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

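/* 64-bit subtraction: subtract the low dwords and borrow one from the
 * high-dword difference when the low subtraction underflows, i.e. when
 * x_lo < y_lo (unsigned).
 */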
static nir_ssa_def *
lower_isub64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_isub(b, x_lo, y_lo);
   nir_ssa_def *borrow = nir_ineg(b, nir_b2i32(b, nir_ult(b, x_lo, y_lo)));
   nir_ssa_def *res_hi = nir_iadd(b, nir_isub(b, x_hi, y_hi), borrow);

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static nir_ssa_def *
lower_ineg64(nir_builder *b, nir_ssa_def *x)
{
   /* Since isub is the same number of instructions (with better dependencies)
    * as iadd, subtraction is actually more efficient for ineg than the usual
    * 2's complement "flip the bits and add one".
    */
   return lower_isub64(b, nir_imm_int64(b, 0), x);
}

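/* iabs: x is negative exactly when the sign bit of its high dword is set. */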
static nir_ssa_def *
lower_iabs64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *x_is_neg = nir_ilt(b, x_hi, nir_imm_int(b, 0));
   return nir_bcsel(b, x_is_neg, nir_ineg(b, x), x);
}

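/* 64-bit comparisons: equality and inequality look at both halves; the
 * ordered comparisons are decided by the high dwords (compared signed or
 * unsigned as appropriate), with an unsigned comparison of the low dwords
 * breaking ties.
 */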
static nir_ssa_def *
lower_int64_compare(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   switch (op) {
   case nir_op_ieq:
      return nir_iand(b, nir_ieq(b, x_hi, y_hi), nir_ieq(b, x_lo, y_lo));
   case nir_op_ine:
      return nir_ior(b, nir_ine(b, x_hi, y_hi), nir_ine(b, x_lo, y_lo));
   case nir_op_ult:
      return nir_ior(b, nir_ult(b, x_hi, y_hi),
                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
                                    nir_ult(b, x_lo, y_lo)));
   case nir_op_ilt:
      return nir_ior(b, nir_ilt(b, x_hi, y_hi),
                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
                                    nir_ult(b, x_lo, y_lo)));
   case nir_op_uge:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ult, x, y));
   case nir_op_ige:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ilt, x, y));
   default:
      unreachable("Invalid comparison");
   }
}

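/* Integer min/max: select one of the two 64-bit operands based on the
 * corresponding lowered comparison.
 */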
static nir_ssa_def *
lower_umax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), y, x);
}

static nir_ssa_def *
lower_imax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), y, x);
}

static nir_ssa_def *
lower_umin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), x, y);
}

static nir_ssa_def *
lower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
}

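/* imul: the low 64 bits of the product.  The low dword is x_lo * y_lo; the
 * high dword is the carry out of that multiply (umul_high) plus the low 32
 * bits of the two cross products.  x_hi * y_hi only affects bits above 63
 * and is dropped.
 */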
static nir_ssa_def *
lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_imul(b, x_lo, y_lo);
   nir_ssa_def *res_hi = nir_iadd(b, nir_umul_high(b, x_lo, y_lo),
                            nir_iadd(b, nir_imul(b, x_lo, y_hi),
                                        nir_imul(b, x_hi, y_lo)));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

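/* [iu]mul_high: the upper 64 bits of the full 128-bit product.  Each operand
 * is treated as four 32-bit digits (zero- or sign-extended out to 128 bits)
 * and multiplied schoolbook-style; digits 2 and 3 of the result are the
 * answer.
 */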
static nir_ssa_def *
lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                 bool sign_extend)
{
   nir_ssa_def *x32[4], *y32[4];
   x32[0] = nir_unpack_64_2x32_split_x(b, x);
   x32[1] = nir_unpack_64_2x32_split_y(b, x);
   if (sign_extend) {
      x32[2] = x32[3] = nir_ishr(b, x32[1], nir_imm_int(b, 31));
   } else {
      x32[2] = x32[3] = nir_imm_int(b, 0);
   }

   y32[0] = nir_unpack_64_2x32_split_x(b, y);
   y32[1] = nir_unpack_64_2x32_split_y(b, y);
   if (sign_extend) {
      y32[2] = y32[3] = nir_ishr(b, y32[1], nir_imm_int(b, 31));
   } else {
      y32[2] = y32[3] = nir_imm_int(b, 0);
   }

   nir_ssa_def *res[8] = { NULL, };

   /* Yes, the following generates a pile of code.  However, we throw res[0]
    * and res[1] away in the end and, if we're in the umul case, four of our
    * eight dword operands will be constant zero and opt_algebraic will clean
    * this up nicely.
    */
   for (unsigned i = 0; i < 4; i++) {
      nir_ssa_def *carry = NULL;
      for (unsigned j = 0; j < 4; j++) {
         /* The maximum values of x32[i] and y32[i] are UINT32_MAX so the
          * maximum value of tmp is UINT32_MAX * UINT32_MAX.  The maximum
          * value that will fit in tmp is
          *
          *    UINT64_MAX = (UINT32_MAX << 32) + UINT32_MAX
          *               = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
          *               = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
          *
          * so we're guaranteed that we can add in two more 32-bit values
          * without overflowing tmp.
          */
         nir_ssa_def *tmp =
            nir_pack_64_2x32_split(b, nir_imul(b, x32[i], y32[j]),
                                      nir_umul_high(b, x32[i], y32[j]));
         if (res[i + j])
            tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
         if (carry)
            tmp = nir_iadd(b, tmp, carry);
         res[i + j] = nir_u2u32(b, tmp);
         carry = nir_ushr(b, tmp, nir_imm_int(b, 32));
      }
      res[i + 4] = nir_u2u32(b, carry);
   }

   return nir_pack_64_2x32_split(b, res[2], res[3]);
}

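/* isign: res_hi is x_hi >> 31, i.e. -1 for negative x and 0 otherwise.
 * OR-ing the non-zero flag into the low dword turns 0 into 1 for positive
 * non-zero values, giving -1, 0, or 1 as required.
 */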
static nir_ssa_def *
lower_isign64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi));
   nir_ssa_def *res_hi = nir_ishr(b, x_hi, nir_imm_int(b, 31));
   nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

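/* Unsigned 64-bit division with remainder by binary long division.  The
 * first loop builds the high dword of the quotient (it can only be non-zero
 * when the denominator's high dword is zero); the second builds the low
 * dword by subtracting shifted copies of the full 64-bit denominator from
 * the running numerator, which is the remainder once the loop finishes.
 */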
static void
lower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d,
                   nir_ssa_def **q, nir_ssa_def **r)
{
   /* TODO: We should specially handle the case where the denominator is a
    * constant.  In that case, we should be able to reduce it to a multiply by
    * a constant, some shifts, and an add.
    */
   nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n);
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_const_value v = { .u32 = { 0, 0, 0, 0 } };
   nir_ssa_def *q_lo = nir_build_imm(b, n->num_components, 32, v);
   nir_ssa_def *q_hi = nir_build_imm(b, n->num_components, 32, v);

   nir_ssa_def *n_hi_before_if = n_hi;
   nir_ssa_def *q_hi_before_if = q_hi;

   /* If the upper 32 bits of denom are non-zero, it is impossible for shifts
    * greater than 32 bits to occur.  If the upper 32 bits of the numerator
    * are zero, it is impossible for (denom << [63, 32]) <= numer unless
    * denom == 0.
    */
   nir_ssa_def *need_high_div =
      nir_iand(b, nir_ieq(b, d_hi, nir_imm_int(b, 0)), nir_uge(b, n_hi, d_lo));
   nir_push_if(b, nir_bany(b, need_high_div));
   {
      /* If we only have one component, then the bany above goes away and
       * this is always true within the if statement.
       */
      if (n->num_components == 1)
         need_high_div = nir_imm_true(b);

      nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo);

      for (int i = 31; i >= 0; i--) {
         /* if ((d.x << i) <= n.y) {
          *    n.y -= d.x << i;
          *    quot.y |= 1U << i;
          * }
          */
         nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i));
         nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift);
         nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i));
         nir_ssa_def *cond = nir_iand(b, need_high_div,
                                         nir_uge(b, n_hi, d_shift));
         if (i != 0) {
            /* log2_d_lo is always <= 31, so we don't need to bother with it
             * in the last iteration.
             */
            cond = nir_iand(b, cond,
                               nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo));
         }
         n_hi = nir_bcsel(b, cond, new_n_hi, n_hi);
         q_hi = nir_bcsel(b, cond, new_q_hi, q_hi);
      }
   }
   nir_pop_if(b, NULL);
   n_hi = nir_if_phi(b, n_hi, n_hi_before_if);
   q_hi = nir_if_phi(b, q_hi, q_hi_before_if);

   nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi);

   n = nir_pack_64_2x32_split(b, n_lo, n_hi);
   d = nir_pack_64_2x32_split(b, d_lo, d_hi);
   for (int i = 31; i >= 0; i--) {
      /* if ((d64 << i) <= n64) {
       *    n64 -= d64 << i;
       *    quot.x |= 1U << i;
       * }
       */
      nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i));
      nir_ssa_def *new_n = nir_isub(b, n, d_shift);
      nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i));
      nir_ssa_def *cond = nir_uge(b, n, d_shift);
      if (i != 0) {
         /* log2_denom is always <= 31, so we don't need to bother with it
          * in the last iteration.
          */
         cond = nir_iand(b, cond,
                            nir_ige(b, nir_imm_int(b, 31 - i), log2_denom));
      }
      n = nir_bcsel(b, cond, new_n, n);
      q_lo = nir_bcsel(b, cond, new_q_lo, q_lo);
   }

   *q = nir_pack_64_2x32_split(b, q_lo, q_hi);
   *r = n;
}

static nir_ssa_def *
lower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return q;
}

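/* Signed division: divide the magnitudes and negate the quotient when
 * exactly one of the operands is negative.
 */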
static nir_ssa_def *
lower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)),
                                    nir_ilt(b, d_hi, nir_imm_int(b, 0)));
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, negate, nir_ineg(b, q), q);
}

static nir_ssa_def *
lower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return r;
}

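/* imod: the result takes the sign of the divisor.  Compute the remainder of
 * the magnitudes, give it the numerator's sign, and when the operands' signs
 * differ (and the remainder is non-zero) add the divisor to flip the sign.
 */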
463 lower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
464 {
465 nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
466 nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
467 nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
468 nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));
469
470 nir_ssa_def *q, *r;
471 lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
472
473 nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
474
475 return nir_bcsel(b, nir_ieq(b, r, nir_imm_int64(b, 0)), nir_imm_int64(b, 0),
476 nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
477 nir_iadd(b, rem, d)));
478 }
479
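/* irem: the result takes the sign of the numerator, so just negate the
 * magnitude remainder when the numerator is negative.
 */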
static nir_ssa_def *
lower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
}

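/* Return the nir_lower_int64_options bit that requests lowering of the given
 * opcode, or 0 if this pass never lowers it.
 */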
static nir_lower_int64_options
opcode_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_imul:
      return nir_lower_imul64;
   case nir_op_imul_high:
   case nir_op_umul_high:
      return nir_lower_imul_high64;
   case nir_op_isign:
      return nir_lower_isign64;
   case nir_op_udiv:
   case nir_op_idiv:
   case nir_op_umod:
   case nir_op_imod:
   case nir_op_irem:
      return nir_lower_divmod64;
   case nir_op_b2i64:
   case nir_op_i2b1:
   case nir_op_i2i32:
   case nir_op_i2i64:
   case nir_op_u2u32:
   case nir_op_u2u64:
   case nir_op_bcsel:
      return nir_lower_mov64;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return nir_lower_icmp64;
   case nir_op_iadd:
   case nir_op_isub:
      return nir_lower_iadd64;
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
      return nir_lower_minmax64;
   case nir_op_iabs:
      return nir_lower_iabs64;
   case nir_op_ineg:
      return nir_lower_ineg64;
   case nir_op_iand:
   case nir_op_ior:
   case nir_op_ixor:
   case nir_op_inot:
      return nir_lower_logic64;
   default:
      return 0;
   }
}

static nir_ssa_def *
lower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)
{
   nir_ssa_def *src[4];
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
      src[i] = nir_ssa_for_alu_src(b, alu, i);

   switch (alu->op) {
   case nir_op_imul:
      return lower_imul64(b, src[0], src[1]);
   case nir_op_imul_high:
      return lower_mul_high64(b, src[0], src[1], true);
   case nir_op_umul_high:
      return lower_mul_high64(b, src[0], src[1], false);
   case nir_op_isign:
      return lower_isign64(b, src[0]);
   case nir_op_udiv:
      return lower_udiv64(b, src[0], src[1]);
   case nir_op_idiv:
      return lower_idiv64(b, src[0], src[1]);
   case nir_op_umod:
      return lower_umod64(b, src[0], src[1]);
   case nir_op_imod:
      return lower_imod64(b, src[0], src[1]);
   case nir_op_irem:
      return lower_irem64(b, src[0], src[1]);
   case nir_op_b2i64:
      return lower_b2i64(b, src[0]);
   case nir_op_i2b1:
      return lower_i2b(b, src[0]);
   case nir_op_i2i8:
      return lower_i2i8(b, src[0]);
   case nir_op_i2i16:
      return lower_i2i16(b, src[0]);
   case nir_op_i2i32:
      return lower_i2i32(b, src[0]);
   case nir_op_i2i64:
      return lower_i2i64(b, src[0]);
   case nir_op_u2u8:
      return lower_u2u8(b, src[0]);
   case nir_op_u2u16:
      return lower_u2u16(b, src[0]);
   case nir_op_u2u32:
      return lower_u2u32(b, src[0]);
   case nir_op_u2u64:
      return lower_u2u64(b, src[0]);
   case nir_op_bcsel:
      return lower_bcsel64(b, src[0], src[1], src[2]);
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return lower_int64_compare(b, alu->op, src[0], src[1]);
   case nir_op_iadd:
      return lower_iadd64(b, src[0], src[1]);
   case nir_op_isub:
      return lower_isub64(b, src[0], src[1]);
   case nir_op_imin:
      return lower_imin64(b, src[0], src[1]);
   case nir_op_imax:
      return lower_imax64(b, src[0], src[1]);
   case nir_op_umin:
      return lower_umin64(b, src[0], src[1]);
   case nir_op_umax:
      return lower_umax64(b, src[0], src[1]);
   case nir_op_iabs:
      return lower_iabs64(b, src[0]);
   case nir_op_ineg:
      return lower_ineg64(b, src[0]);
   case nir_op_iand:
      return lower_iand64(b, src[0], src[1]);
   case nir_op_ior:
      return lower_ior64(b, src[0], src[1]);
   case nir_op_ixor:
      return lower_ixor64(b, src[0], src[1]);
   case nir_op_inot:
      return lower_inot64(b, src[0]);
   default:
      unreachable("Invalid ALU opcode to lower");
   }
}

static bool
lower_int64_impl(nir_function_impl *impl, nir_lower_int64_options options)
{
   nir_builder b;
   nir_builder_init(&b, impl);

   bool progress = false;
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_alu)
            continue;

         nir_alu_instr *alu = nir_instr_as_alu(instr);
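         /* Conversions and comparisons have a non-64-bit destination, so
          * whether they need lowering is determined by their source bit
          * size; bcsel is likewise checked via its value sources since its
          * condition is a 1-bit boolean.  Everything else is identified by
          * a 64-bit destination.
          */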
         switch (alu->op) {
         case nir_op_i2b1:
         case nir_op_i2i32:
         case nir_op_u2u32:
            assert(alu->src[0].src.is_ssa);
            if (alu->src[0].src.ssa->bit_size != 64)
               continue;
            break;
         case nir_op_bcsel:
            assert(alu->src[1].src.is_ssa);
            assert(alu->src[2].src.is_ssa);
            assert(alu->src[1].src.ssa->bit_size ==
                   alu->src[2].src.ssa->bit_size);
            if (alu->src[1].src.ssa->bit_size != 64)
               continue;
            break;
         case nir_op_ieq:
         case nir_op_ine:
         case nir_op_ult:
         case nir_op_ilt:
         case nir_op_uge:
         case nir_op_ige:
            assert(alu->src[0].src.is_ssa);
            assert(alu->src[1].src.is_ssa);
            assert(alu->src[0].src.ssa->bit_size ==
                   alu->src[1].src.ssa->bit_size);
            if (alu->src[0].src.ssa->bit_size != 64)
               continue;
            break;
         default:
            assert(alu->dest.dest.is_ssa);
            if (alu->dest.dest.ssa.bit_size != 64)
               continue;
            break;
         }

         if (!(options & opcode_to_options_mask(alu->op)))
            continue;

         b.cursor = nir_before_instr(instr);

         nir_ssa_def *lowered = lower_int64_alu_instr(&b, alu);
         nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa,
                                  nir_src_for_ssa(lowered));
         nir_instr_remove(&alu->instr);
         progress = true;
      }
   }

   if (progress)
      nir_metadata_preserve(impl, nir_metadata_none);

   return progress;
}

bool
nir_lower_int64(nir_shader *shader, nir_lower_int64_options options)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= lower_int64_impl(function->impl, options);
   }

   return progress;
}