2 * Copyright © 2018 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
27 * http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf
29 * The author, ridiculous_fish, wrote:
31 * ''Reference implementations of computing and using the "magic number"
32 * approach to dividing by constants, including codegen instructions.
33 * The unsigned division incorporates the "round down" optimization per
36 * This is free and unencumbered software. Any copyright is dedicated
37 * to the Public Domain.''
40 #include "fast_idiv_by_const.h"
45 struct util_fast_udiv_info
46 util_compute_fast_udiv_info(uint64_t D
, unsigned num_bits
, unsigned UINT_BITS
)
48 /* The numerator must fit in a uint64_t */
49 assert(num_bits
> 0 && num_bits
<= UINT_BITS
);
52 /* The eventual result */
53 struct util_fast_udiv_info result
;
56 /* The extra shift implicit in the difference between UINT_BITS and num_bits
58 const unsigned extra_shift
= UINT_BITS
- num_bits
;
60 /* The initial power of 2 is one less than the first one that can possibly
63 const uint64_t initial_power_of_2
= (uint64_t)1 << (UINT_BITS
-1);
65 /* The remainder and quotient of our power of 2 divided by d */
66 uint64_t quotient
= initial_power_of_2
/ D
;
67 uint64_t remainder
= initial_power_of_2
% D
;
70 unsigned ceil_log_2_D
;
72 /* The magic info for the variant "round down" algorithm */
73 uint64_t down_multiplier
= 0;
74 unsigned down_exponent
= 0;
75 int has_magic_down
= 0;
77 /* Compute ceil(log_2 D) */
80 for (tmp
= D
; tmp
> 0; tmp
>>= 1)
84 /* Begin a loop that increments the exponent, until we find a power of 2
88 for (exponent
= 0; ; exponent
++) {
89 /* Quotient and remainder is from previous exponent; compute it for this
92 if (remainder
>= D
- remainder
) {
93 /* Doubling remainder will wrap around D */
94 quotient
= quotient
* 2 + 1;
95 remainder
= remainder
* 2 - D
;
97 /* Remainder will not wrap */
98 quotient
= quotient
* 2;
99 remainder
= remainder
* 2;
102 /* We're done if this exponent works for the round_up algorithm.
103 * Note that exponent may be larger than the maximum shift supported,
104 * so the check for >= ceil_log_2_D is critical.
106 if ((exponent
+ extra_shift
>= ceil_log_2_D
) ||
107 (D
- remainder
) <= ((uint64_t)1 << (exponent
+ extra_shift
)))
110 /* Set magic_down if we have not set it yet and this exponent works for
111 * the round_down algorithm
113 if (!has_magic_down
&&
114 remainder
<= ((uint64_t)1 << (exponent
+ extra_shift
))) {
116 down_multiplier
= quotient
;
117 down_exponent
= exponent
;
121 if (exponent
< ceil_log_2_D
) {
122 /* magic_up is efficient */
123 result
.multiplier
= quotient
+ 1;
124 result
.pre_shift
= 0;
125 result
.post_shift
= exponent
;
126 result
.increment
= 0;
128 /* Odd divisor, so use magic_down, which must have been set */
129 assert(has_magic_down
);
130 result
.multiplier
= down_multiplier
;
131 result
.pre_shift
= 0;
132 result
.post_shift
= down_exponent
;
133 result
.increment
= 1;
135 /* Even divisor, so use a prefix-shifted dividend */
136 unsigned pre_shift
= 0;
137 uint64_t shifted_D
= D
;
138 while ((shifted_D
& 1) == 0) {
142 result
= util_compute_fast_udiv_info(shifted_D
, num_bits
- pre_shift
,
144 /* expect no increment or pre_shift in this path */
145 assert(result
.increment
== 0 && result
.pre_shift
== 0);
146 result
.pre_shift
= pre_shift
;
151 static inline int64_t
152 sign_extend(int64_t x
, unsigned SINT_BITS
)
154 return (x
<< (64 - SINT_BITS
)) >> (64 - SINT_BITS
);
157 struct util_fast_sdiv_info
158 util_compute_fast_sdiv_info(int64_t D
, unsigned SINT_BITS
)
160 /* D must not be zero. */
162 /* The result is not correct for these divisors. */
163 assert(D
!= 1 && D
!= -1);
166 struct util_fast_sdiv_info result
;
168 /* Absolute value of D (we know D is not the most negative value since
169 * that's a power of 2)
171 const uint64_t abs_d
= (D
< 0 ? -D
: D
);
173 /* The initial power of 2 is one less than the first one that can possibly
175 /* "two31" in Warren */
176 unsigned exponent
= SINT_BITS
- 1;
177 const uint64_t initial_power_of_2
= (uint64_t)1 << exponent
;
179 /* Compute the absolute value of our "test numerator,"
180 * which is the largest dividend whose remainder with d is d-1.
181 * This is called anc in Warren.
183 const uint64_t tmp
= initial_power_of_2
+ (D
< 0);
184 const uint64_t abs_test_numer
= tmp
- 1 - tmp
% abs_d
;
186 /* Initialize our quotients and remainders (q1, r1, q2, r2 in Warren) */
187 uint64_t quotient1
= initial_power_of_2
/ abs_test_numer
;
188 uint64_t remainder1
= initial_power_of_2
% abs_test_numer
;
189 uint64_t quotient2
= initial_power_of_2
/ abs_d
;
190 uint64_t remainder2
= initial_power_of_2
% abs_d
;
195 /* Update the exponent */
198 /* Update quotient1 and remainder1 */
201 if (remainder1
>= abs_test_numer
) {
203 remainder1
-= abs_test_numer
;
206 /* Update quotient2 and remainder2 */
209 if (remainder2
>= abs_d
) {
214 /* Keep going as long as (2**exponent) / abs_d <= delta */
215 delta
= abs_d
- remainder2
;
216 } while (quotient1
< delta
|| (quotient1
== delta
&& remainder1
== 0));
218 result
.multiplier
= sign_extend(quotient2
+ 1, SINT_BITS
);
219 if (D
< 0) result
.multiplier
= -result
.multiplier
;
220 result
.shift
= exponent
- SINT_BITS
;