/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
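
/* Lowers a 64-bit integer multiply to 32-bit operations.  Writing each
 * operand as hi << 32 | lo, the low 64 bits of the product are
 *
 *    x * y = ((x_hi * y_lo + x_lo * y_hi) << 32) + x_lo * y_lo
 *
 * where x_lo * y_lo is the full 64-bit product of the low dwords (its high
 * half comes from umul_high) and the x_hi * y_hi term is dropped because it
 * shifts entirely out of the low 64 bits.
 */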
static nir_ssa_def *
lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_imul(b, x_lo, y_lo);
   nir_ssa_def *res_hi = nir_iadd(b, nir_umul_high(b, x_lo, y_lo),
                         nir_iadd(b, nir_imul(b, x_lo, y_hi),
                                     nir_imul(b, x_hi, y_lo)));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
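
/* Lowers imul_high/umul_high (the high 64 bits of a 64x64 -> 128-bit
 * multiply) via schoolbook multiplication of 32-bit dwords.  Each operand is
 * split into four dwords: the two real dwords plus two more that are either
 * a sign extension (imul_high) or zero (umul_high), so the signed case falls
 * out of the same unsigned 4x4 partial-product loop.
 */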
static nir_ssa_def *
lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                 bool sign_extend)
{
   nir_ssa_def *x32[4], *y32[4];
   x32[0] = nir_unpack_64_2x32_split_x(b, x);
   x32[1] = nir_unpack_64_2x32_split_y(b, x);
   if (sign_extend) {
      x32[2] = x32[3] = nir_ishr(b, x32[1], nir_imm_int(b, 31));
   } else {
      x32[2] = x32[3] = nir_imm_int(b, 0);
   }

   y32[0] = nir_unpack_64_2x32_split_x(b, y);
   y32[1] = nir_unpack_64_2x32_split_y(b, y);
   if (sign_extend) {
      y32[2] = y32[3] = nir_ishr(b, y32[1], nir_imm_int(b, 31));
   } else {
      y32[2] = y32[3] = nir_imm_int(b, 0);
   }

   nir_ssa_def *res[8] = { NULL, };

   /* Yes, the following generates a pile of code.  However, we throw res[0]
    * and res[1] away in the end and, if we're in the umul case, four of our
    * eight dword operands will be constant zero and opt_algebraic will clean
    * this up nicely.
    */
   for (unsigned i = 0; i < 4; i++) {
      nir_ssa_def *carry = NULL;
      for (unsigned j = 0; j < 4; j++) {
         /* The maximum values of x32[i] and y32[j] are UINT32_MAX so the
          * maximum value of tmp is UINT32_MAX * UINT32_MAX.  The maximum
          * value that will fit in tmp is
          *
          *    UINT64_MAX = (UINT32_MAX << 32) + UINT32_MAX
          *               = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
          *               = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
          *
          * so we're guaranteed that we can add in two more 32-bit values
          * without overflowing tmp.
          */
         nir_ssa_def *tmp =
            nir_pack_64_2x32_split(b, nir_imul(b, x32[i], y32[j]),
                                      nir_umul_high(b, x32[i], y32[j]));
         if (res[i + j])
            tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
         if (carry)
            tmp = nir_iadd(b, tmp, carry);
         res[i + j] = nir_u2u32(b, tmp);
         carry = nir_ushr(b, tmp, nir_imm_int(b, 32));
      }
      res[i + 4] = nir_u2u32(b, carry);
   }

   return nir_pack_64_2x32_split(b, res[2], res[3]);
}
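
/* Lowers 64-bit isign: the result is -1 if x < 0, 0 if x == 0, and 1 if
 * x > 0.  res_hi (x_hi >> 31) is all-ones for negative x and zero otherwise;
 * ORing in 1 when x is non-zero then yields exactly -1, 0, or 1.
 */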
static nir_ssa_def *
lower_isign64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi));
   nir_ssa_def *res_hi = nir_ishr(b, x_hi, nir_imm_int(b, 31));
   nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}
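
/* Computes both the unsigned 64-bit quotient and remainder using binary long
 * division, one quotient bit per iteration.  The quotient is built a dword
 * at a time: the high dword can only be non-zero when the denominator fits
 * in 32 bits, so that phase runs under a conditional, and the low dword is
 * then computed over the full 64-bit values.  Whatever is left of the
 * numerator at the end is the remainder.
 */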
static void
lower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d,
                   nir_ssa_def **q, nir_ssa_def **r)
{
   /* TODO: We should specially handle the case where the denominator is a
    * constant.  In that case, we should be able to reduce it to a multiply
    * by a constant, some shifts, and an add.
    */
   nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n);
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_const_value v = { .u32 = { 0, 0, 0, 0 } };
   nir_ssa_def *q_lo = nir_build_imm(b, n->num_components, 32, v);
   nir_ssa_def *q_hi = nir_build_imm(b, n->num_components, 32, v);

   nir_ssa_def *n_hi_before_if = n_hi;
   nir_ssa_def *q_hi_before_if = q_hi;

   /* If the upper 32 bits of denom are non-zero, it is impossible for shifts
    * greater than 32 bits to occur.  If the upper 32 bits of the numerator
    * are zero, it is impossible for (denom << [63, 32]) <= numer unless
    * denom == 0.
    */
   nir_ssa_def *need_high_div =
      nir_iand(b, nir_ieq(b, d_hi, nir_imm_int(b, 0)), nir_uge(b, n_hi, d_lo));
   nir_push_if(b, nir_bany(b, need_high_div));
   {
      /* If we only have one component, then the bany above goes away and
       * this is always true within the if statement.
       */
      if (n->num_components == 1)
         need_high_div = nir_imm_true(b);

      nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo);

      for (int i = 31; i >= 0; i--) {
         /* if ((d.x << i) <= n.y) {
          *    n.y -= d.x << i;
          *    quot.y |= 1U << i;
          * }
          */
         nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i));
         nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift);
         nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i));
         nir_ssa_def *cond = nir_iand(b, need_high_div,
                                         nir_uge(b, n_hi, d_shift));
         if (i != 0) {
            /* log2_d_lo is always <= 31, so we don't need to bother with it
             * in the last iteration.
             */
            cond = nir_iand(b, cond,
                               nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo));
         }
         n_hi = nir_bcsel(b, cond, new_n_hi, n_hi);
         q_hi = nir_bcsel(b, cond, new_q_hi, q_hi);
      }
   }
   nir_pop_if(b, NULL);
   n_hi = nir_if_phi(b, n_hi, n_hi_before_if);
   q_hi = nir_if_phi(b, q_hi, q_hi_before_if);

   nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi);

   n = nir_pack_64_2x32_split(b, n_lo, n_hi);
   d = nir_pack_64_2x32_split(b, d_lo, d_hi);
   for (int i = 31; i >= 0; i--) {
      /* if ((d64 << i) <= n64) {
       *    n64 -= d64 << i;
       *    quot.x |= 1U << i;
       * }
       */
      nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i));
      nir_ssa_def *new_n = nir_isub(b, n, d_shift);
      nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i));
      nir_ssa_def *cond = nir_uge(b, n, d_shift);
      if (i != 0) {
         /* log2_denom is always <= 31, so we don't need to bother with it
          * in the last iteration.
          */
         cond = nir_iand(b, cond,
                            nir_ige(b, nir_imm_int(b, 31 - i), log2_denom));
      }
      n = nir_bcsel(b, cond, new_n, n);
      q_lo = nir_bcsel(b, cond, new_q_lo, q_lo);
   }

   *q = nir_pack_64_2x32_split(b, q_lo, q_hi);
   *r = n;
}
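
/* udiv is just the quotient half of the combined division. */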
static nir_ssa_def *
lower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return q;
}
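
/* Signed division divides the absolute values and then negates the quotient
 * when exactly one of the operands is negative, i.e. when the sign bits of
 * the high dwords differ.
 */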
static nir_ssa_def *
lower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)),
                                    nir_ilt(b, d_hi, nir_imm_int(b, 0)));
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, negate, nir_ineg(b, q), q);
}
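
/* umod is just the remainder half of the combined division. */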
static nir_ssa_def *
lower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return r;
}
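
/* GLSL-style modulus: the result takes the sign of the divisor.  Start from
 * the C-style remainder (sign of the numerator) and, when the operand signs
 * differ and the remainder is non-zero, add the divisor to move the result
 * into the divisor's sign range.
 */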
static nir_ssa_def *
lower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
   nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);

   nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);

   return nir_bcsel(b, nir_ieq(b, r, nir_imm_int64(b, 0)), nir_imm_int64(b, 0),
          nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
                       nir_iadd(b, rem, d)));
}
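
/* C-style remainder: the result takes the sign of the numerator. */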
static nir_ssa_def *
lower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
}
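
/* Maps an ALU opcode to the nir_lower_int64_options bit that requests its
 * lowering, so callers can opt in per opcode group.
 */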
static nir_lower_int64_options
opcode_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_imul:
      return nir_lower_imul64;
   case nir_op_imul_high:
   case nir_op_umul_high:
      return nir_lower_imul_high64;
   case nir_op_isign:
      return nir_lower_isign64;
   case nir_op_udiv:
   case nir_op_idiv:
   case nir_op_umod:
   case nir_op_imod:
   case nir_op_irem:
      return nir_lower_divmod64;
   default:
      return 0;
   }
}
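
/* Builds the 32-bit replacement sequence for a single 64-bit ALU
 * instruction and returns the new SSA def that stands in for its result.
 */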
static nir_ssa_def *
lower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)
{
   nir_ssa_def *src[4];
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
      src[i] = nir_ssa_for_alu_src(b, alu, i);

   switch (alu->op) {
   case nir_op_imul:
      return lower_imul64(b, src[0], src[1]);
   case nir_op_imul_high:
      return lower_mul_high64(b, src[0], src[1], true);
   case nir_op_umul_high:
      return lower_mul_high64(b, src[0], src[1], false);
   case nir_op_isign:
      return lower_isign64(b, src[0]);
   case nir_op_udiv:
      return lower_udiv64(b, src[0], src[1]);
   case nir_op_idiv:
      return lower_idiv64(b, src[0], src[1]);
   case nir_op_umod:
      return lower_umod64(b, src[0], src[1]);
   case nir_op_imod:
      return lower_imod64(b, src[0], src[1]);
   case nir_op_irem:
      return lower_irem64(b, src[0], src[1]);
   default:
      unreachable("Invalid ALU opcode to lower");
   }
}
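
/* Walks every instruction in the function, replaces each 64-bit ALU op the
 * caller asked to lower, and removes the original instruction.  Nothing is
 * preserved in the metadata since the division lowering introduces control
 * flow.
 */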
static bool
lower_int64_impl(nir_function_impl *impl, nir_lower_int64_options options)
{
   nir_builder b;
   nir_builder_init(&b, impl);

   bool progress = false;
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_alu)
            continue;

         nir_alu_instr *alu = nir_instr_as_alu(instr);
         assert(alu->dest.dest.is_ssa);
         if (alu->dest.dest.ssa.bit_size != 64)
            continue;

         if (!(options & opcode_to_options_mask(alu->op)))
            continue;

         b.cursor = nir_before_instr(instr);

         nir_ssa_def *lowered = lower_int64_alu_instr(&b, alu);
         nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa,
                                  nir_src_for_ssa(lowered));
         nir_instr_remove(&alu->instr);
         progress = true;
      }
   }

   if (progress)
      nir_metadata_preserve(impl, nir_metadata_none);

   return progress;
}
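
/* Pass entry point: lowers the selected 64-bit integer ALU operations to
 * 32-bit arithmetic in every function that has an implementation.
 */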
bool
nir_lower_int64(nir_shader *shader, nir_lower_int64_options options)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= lower_int64_impl(function->impl, options);
   }

   return progress;
}