2 * Copyright © 2018 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
27 * http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf
29 * The author, ridiculous_fish, wrote:
31 * ''Reference implementations of computing and using the "magic number"
32 * approach to dividing by constants, including codegen instructions.
33 * The unsigned division incorporates the "round down" optimization per
36 * This is free and unencumbered software. Any copyright is dedicated
37 * to the Public Domain.''
40 #include "fast_idiv_by_const.h"
45 /* uint_t and sint_t can be replaced by different integer types and the code
46 * will work as-is. The only requirement is that sizeof(uintN) == sizeof(intN).
49 struct util_fast_udiv_info
50 util_compute_fast_udiv_info(uint_t D
, unsigned num_bits
)
52 /* The numerator must fit in a uint_t */
53 assert(num_bits
> 0 && num_bits
<= sizeof(uint_t
) * CHAR_BIT
);
56 /* The eventual result */
57 struct util_fast_udiv_info result
;
59 /* Bits in a uint_t */
60 const unsigned UINT_BITS
= sizeof(uint_t
) * CHAR_BIT
;
62 /* The extra shift implicit in the difference between UINT_BITS and num_bits
64 const unsigned extra_shift
= UINT_BITS
- num_bits
;
66 /* The initial power of 2 is one less than the first one that can possibly
69 const uint_t initial_power_of_2
= (uint_t
)1 << (UINT_BITS
-1);
71 /* The remainder and quotient of our power of 2 divided by d */
72 uint_t quotient
= initial_power_of_2
/ D
;
73 uint_t remainder
= initial_power_of_2
% D
;
76 unsigned ceil_log_2_D
;
78 /* The magic info for the variant "round down" algorithm */
79 uint_t down_multiplier
= 0;
80 unsigned down_exponent
= 0;
81 int has_magic_down
= 0;
83 /* Compute ceil(log_2 D) */
86 for (tmp
= D
; tmp
> 0; tmp
>>= 1)
90 /* Begin a loop that increments the exponent, until we find a power of 2
94 for (exponent
= 0; ; exponent
++) {
95 /* Quotient and remainder is from previous exponent; compute it for this
98 if (remainder
>= D
- remainder
) {
99 /* Doubling remainder will wrap around D */
100 quotient
= quotient
* 2 + 1;
101 remainder
= remainder
* 2 - D
;
103 /* Remainder will not wrap */
104 quotient
= quotient
* 2;
105 remainder
= remainder
* 2;
108 /* We're done if this exponent works for the round_up algorithm.
109 * Note that exponent may be larger than the maximum shift supported,
110 * so the check for >= ceil_log_2_D is critical.
112 if ((exponent
+ extra_shift
>= ceil_log_2_D
) ||
113 (D
- remainder
) <= ((uint_t
)1 << (exponent
+ extra_shift
)))
116 /* Set magic_down if we have not set it yet and this exponent works for
117 * the round_down algorithm
119 if (!has_magic_down
&&
120 remainder
<= ((uint_t
)1 << (exponent
+ extra_shift
))) {
122 down_multiplier
= quotient
;
123 down_exponent
= exponent
;
127 if (exponent
< ceil_log_2_D
) {
128 /* magic_up is efficient */
129 result
.multiplier
= quotient
+ 1;
130 result
.pre_shift
= 0;
131 result
.post_shift
= exponent
;
132 result
.increment
= 0;
134 /* Odd divisor, so use magic_down, which must have been set */
135 assert(has_magic_down
);
136 result
.multiplier
= down_multiplier
;
137 result
.pre_shift
= 0;
138 result
.post_shift
= down_exponent
;
139 result
.increment
= 1;
141 /* Even divisor, so use a prefix-shifted dividend */
142 unsigned pre_shift
= 0;
143 uint_t shifted_D
= D
;
144 while ((shifted_D
& 1) == 0) {
148 result
= util_compute_fast_udiv_info(shifted_D
, num_bits
- pre_shift
);
149 /* expect no increment or pre_shift in this path */
150 assert(result
.increment
== 0 && result
.pre_shift
== 0);
151 result
.pre_shift
= pre_shift
;
156 struct util_fast_sdiv_info
157 util_compute_fast_sdiv_info(sint_t D
)
159 /* D must not be zero. */
161 /* The result is not correct for these divisors. */
162 assert(D
!= 1 && D
!= -1);
165 struct util_fast_sdiv_info result
;
167 /* Bits in an sint_t */
168 const unsigned SINT_BITS
= sizeof(sint_t
) * CHAR_BIT
;
170 /* Absolute value of D (we know D is not the most negative value since
171 * that's a power of 2)
173 const uint_t abs_d
= (D
< 0 ? -D
: D
);
175 /* The initial power of 2 is one less than the first one that can possibly
177 /* "two31" in Warren */
178 unsigned exponent
= SINT_BITS
- 1;
179 const uint_t initial_power_of_2
= (uint_t
)1 << exponent
;
181 /* Compute the absolute value of our "test numerator,"
182 * which is the largest dividend whose remainder with d is d-1.
183 * This is called anc in Warren.
185 const uint_t tmp
= initial_power_of_2
+ (D
< 0);
186 const uint_t abs_test_numer
= tmp
- 1 - tmp
% abs_d
;
188 /* Initialize our quotients and remainders (q1, r1, q2, r2 in Warren) */
189 uint_t quotient1
= initial_power_of_2
/ abs_test_numer
;
190 uint_t remainder1
= initial_power_of_2
% abs_test_numer
;
191 uint_t quotient2
= initial_power_of_2
/ abs_d
;
192 uint_t remainder2
= initial_power_of_2
% abs_d
;
197 /* Update the exponent */
200 /* Update quotient1 and remainder1 */
203 if (remainder1
>= abs_test_numer
) {
205 remainder1
-= abs_test_numer
;
208 /* Update quotient2 and remainder2 */
211 if (remainder2
>= abs_d
) {
216 /* Keep going as long as (2**exponent) / abs_d <= delta */
217 delta
= abs_d
- remainder2
;
218 } while (quotient1
< delta
|| (quotient1
== delta
&& remainder1
== 0));
220 result
.multiplier
= quotient2
+ 1;
221 if (D
< 0) result
.multiplier
= -result
.multiplier
;
222 result
.shift
= exponent
- SINT_BITS
;