2 * Copyright © 2018 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
27 * http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf
29 * The author, ridiculous_fish, wrote:
31 * ''Reference implementations of computing and using the "magic number"
32 * approach to dividing by constants, including codegen instructions.
33 * The unsigned division incorporates the "round down" optimization per
36 * This is free and unencumbered software. Any copyright is dedicated
37 * to the Public Domain.''
40 #include "fast_idiv_by_const.h"
45 struct util_fast_udiv_info
46 util_compute_fast_udiv_info(uint64_t D
, unsigned num_bits
, unsigned UINT_BITS
)
48 /* The numerator must fit in a uint64_t */
49 assert(num_bits
> 0 && num_bits
<= UINT_BITS
);
52 /* The eventual result */
53 struct util_fast_udiv_info result
;
55 if (util_is_power_of_two_or_zero64(D
)) {
56 unsigned div_shift
= util_logbase2_64(D
);
59 /* Dividing by a power of two. */
60 result
.multiplier
= 1ull << (UINT_BITS
- div_shift
);
62 result
.post_shift
= 0;
67 /* Assuming: floor((num + 1) * (2^32 - 1) / 2^32) = num */
68 result
.multiplier
= UINT_BITS
== 64 ? UINT64_MAX
:
69 (1ull << UINT_BITS
) - 1;
71 result
.post_shift
= 0;
77 /* The extra shift implicit in the difference between UINT_BITS and num_bits
79 const unsigned extra_shift
= UINT_BITS
- num_bits
;
81 /* The initial power of 2 is one less than the first one that can possibly
84 const uint64_t initial_power_of_2
= (uint64_t)1 << (UINT_BITS
-1);
86 /* The remainder and quotient of our power of 2 divided by d */
87 uint64_t quotient
= initial_power_of_2
/ D
;
88 uint64_t remainder
= initial_power_of_2
% D
;
91 unsigned ceil_log_2_D
;
93 /* The magic info for the variant "round down" algorithm */
94 uint64_t down_multiplier
= 0;
95 unsigned down_exponent
= 0;
96 int has_magic_down
= 0;
98 /* Compute ceil(log_2 D) */
101 for (tmp
= D
; tmp
> 0; tmp
>>= 1)
105 /* Begin a loop that increments the exponent, until we find a power of 2
109 for (exponent
= 0; ; exponent
++) {
110 /* Quotient and remainder is from previous exponent; compute it for this
113 if (remainder
>= D
- remainder
) {
114 /* Doubling remainder will wrap around D */
115 quotient
= quotient
* 2 + 1;
116 remainder
= remainder
* 2 - D
;
118 /* Remainder will not wrap */
119 quotient
= quotient
* 2;
120 remainder
= remainder
* 2;
123 /* We're done if this exponent works for the round_up algorithm.
124 * Note that exponent may be larger than the maximum shift supported,
125 * so the check for >= ceil_log_2_D is critical.
127 if ((exponent
+ extra_shift
>= ceil_log_2_D
) ||
128 (D
- remainder
) <= ((uint64_t)1 << (exponent
+ extra_shift
)))
131 /* Set magic_down if we have not set it yet and this exponent works for
132 * the round_down algorithm
134 if (!has_magic_down
&&
135 remainder
<= ((uint64_t)1 << (exponent
+ extra_shift
))) {
137 down_multiplier
= quotient
;
138 down_exponent
= exponent
;
142 if (exponent
< ceil_log_2_D
) {
143 /* magic_up is efficient */
144 result
.multiplier
= quotient
+ 1;
145 result
.pre_shift
= 0;
146 result
.post_shift
= exponent
;
147 result
.increment
= 0;
149 /* Odd divisor, so use magic_down, which must have been set */
150 assert(has_magic_down
);
151 result
.multiplier
= down_multiplier
;
152 result
.pre_shift
= 0;
153 result
.post_shift
= down_exponent
;
154 result
.increment
= 1;
156 /* Even divisor, so use a prefix-shifted dividend */
157 unsigned pre_shift
= 0;
158 uint64_t shifted_D
= D
;
159 while ((shifted_D
& 1) == 0) {
163 result
= util_compute_fast_udiv_info(shifted_D
, num_bits
- pre_shift
,
165 /* expect no increment or pre_shift in this path */
166 assert(result
.increment
== 0 && result
.pre_shift
== 0);
167 result
.pre_shift
= pre_shift
;
172 static inline int64_t
173 sign_extend(int64_t x
, unsigned SINT_BITS
)
175 return (x
<< (64 - SINT_BITS
)) >> (64 - SINT_BITS
);
178 struct util_fast_sdiv_info
179 util_compute_fast_sdiv_info(int64_t D
, unsigned SINT_BITS
)
181 /* D must not be zero. */
183 /* The result is not correct for these divisors. */
184 assert(D
!= 1 && D
!= -1);
187 struct util_fast_sdiv_info result
;
189 /* Absolute value of D (we know D is not the most negative value since
190 * that's a power of 2)
192 const uint64_t abs_d
= (D
< 0 ? -D
: D
);
194 /* The initial power of 2 is one less than the first one that can possibly
196 /* "two31" in Warren */
197 unsigned exponent
= SINT_BITS
- 1;
198 const uint64_t initial_power_of_2
= (uint64_t)1 << exponent
;
200 /* Compute the absolute value of our "test numerator,"
201 * which is the largest dividend whose remainder with d is d-1.
202 * This is called anc in Warren.
204 const uint64_t tmp
= initial_power_of_2
+ (D
< 0);
205 const uint64_t abs_test_numer
= tmp
- 1 - tmp
% abs_d
;
207 /* Initialize our quotients and remainders (q1, r1, q2, r2 in Warren) */
208 uint64_t quotient1
= initial_power_of_2
/ abs_test_numer
;
209 uint64_t remainder1
= initial_power_of_2
% abs_test_numer
;
210 uint64_t quotient2
= initial_power_of_2
/ abs_d
;
211 uint64_t remainder2
= initial_power_of_2
% abs_d
;
216 /* Update the exponent */
219 /* Update quotient1 and remainder1 */
222 if (remainder1
>= abs_test_numer
) {
224 remainder1
-= abs_test_numer
;
227 /* Update quotient2 and remainder2 */
230 if (remainder2
>= abs_d
) {
235 /* Keep going as long as (2**exponent) / abs_d <= delta */
236 delta
= abs_d
- remainder2
;
237 } while (quotient1
< delta
|| (quotient1
== delta
&& remainder1
== 0));
239 result
.multiplier
= sign_extend(quotient2
+ 1, SINT_BITS
);
240 if (D
< 0) result
.multiplier
= -result
.multiplier
;
241 result
.shift
= exponent
- SINT_BITS
;