/*
 * Copyright © 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef FAST_IDIV_BY_CONST_H
#define FAST_IDIV_BY_CONST_H

/* Reference:
 * https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
 */

/* Computes "magic info" for performing signed division by a fixed integer D.
 * The type 'sint_t' is assumed to be defined as a signed integer type large
 * enough to hold both the dividend and the divisor.
 * Here >> is arithmetic (signed) shift, and >>> is logical shift.
 *
 * To emit code for n/D, rounding towards zero, use the following sequence:
 *
 *   m = util_compute_fast_sdiv_info(D, SINT_BITS)
 *   emit("result = (m.multiplier * n) >> SINT_BITS");
 *   if D > 0 and m.multiplier < 0: emit("result += n")
 *   if D < 0 and m.multiplier > 0: emit("result -= n")
 *   if m.shift > 0: emit("result >>= m.shift")
 *   emit("result += (result < 0)")
 *
 * The shifts by SINT_BITS may be "free" if the high half of the full multiply
 * is put in a separate register.
 *
 * The final add can of course be implemented via the sign bit, e.g.
 *   result += (result >>> (SINT_BITS - 1))
 * or
 *   result -= (result >> (SINT_BITS - 1))
 *
 * This code is heavily indebted to Hacker's Delight by Henry Warren.
 * See http://www.hackersdelight.org/HDcode/magic.c.txt
 * Used with permission from http://www.hackersdelight.org/permissions.htm
 */

/* Constants returned by util_compute_fast_sdiv_info() for replacing a signed
 * division by a fixed divisor with a multiply followed by a shift.
 */
struct util_fast_sdiv_info {
   int64_t multiplier; /* the "magic number" multiplier */
   unsigned shift;     /* shift for the dividend after multiplying */
};

/* Computes the signed-division magic constants for the fixed divisor D.
 * SINT_BITS is the bit width of the signed arithmetic the constants are
 * meant for.  Only the declaration lives here; the definition is elsewhere.
 */
struct util_fast_sdiv_info
util_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS);

/* Computes "magic info" for performing unsigned division by a fixed positive
 * integer D. UINT_BITS is the bit size at which the final "magic"
 * calculation will be performed; it is assumed to be large enough to hold
 * both the dividend and the divisor. num_bits can be set appropriately if n
 * is known to be narrower than UINT_BITS; if this is not known then UINT_BITS
 * can be used for num_bits.
 *
 * Assume we have a hardware register of width UINT_BITS, a known constant D
 * which is not zero and not a power of 2, and a variable n of width num_bits
 * (which may be up to UINT_BITS). To emit code for n/D, use one of the two
 * following sequences (here >>> refers to a logical bitshift):
 *
 *   m = util_compute_fast_udiv_info(D, num_bits, UINT_BITS)
 *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
 *   if m.increment: emit("n = saturated_increment(n)")
 *   emit("result = (m.multiplier * n) >>> UINT_BITS")
 *   if m.post_shift > 0: emit("result >>>= m.post_shift")
 *
 * or
 *
 *   m = util_compute_fast_udiv_info(D, num_bits, UINT_BITS)
 *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
 *   emit("result = m.multiplier * n")
 *   if m.increment: emit("result = result + m.multiplier")
 *   emit("result >>>= UINT_BITS")
 *   if m.post_shift > 0: emit("result >>>= m.post_shift")
 *
 * This second version works even if D is 1. The shifts by UINT_BITS may be
 * "free" if the high half of the full multiply is put in a separate register.
 *
 * saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
 *   if n == (1 << UINT_BITS)-1: result = n
 *   else: result = n+1
 * A common way to implement this is with the carry bit. For example, on x86:
 *   add 1
 *   sbb 0
 *
 * Some invariants:
 *  1: At least one of pre_shift and increment is zero
 *  2: multiplier is never zero
 *
 * This code incorporates the "round down" optimization per ridiculous_fish.
 */

/* Constants returned by util_compute_fast_udiv_info() for replacing an
 * unsigned division by a fixed divisor with shifts, an optional increment
 * and a multiply.
 */
struct util_fast_udiv_info {
   uint64_t multiplier; /* the "magic number" multiplier */
   unsigned pre_shift;  /* shift for the dividend before multiplying */
   unsigned post_shift; /* shift for the dividend after multiplying */
   int increment;       /* 0 or 1; if set then increment the numerator, using
                         * one of the two strategies */
};

/* Computes the unsigned-division magic constants for the fixed divisor D.
 * num_bits is the width of the dividend and UINT_BITS the width at which the
 * final calculation is performed.  Only the declaration lives here; the
 * definition is elsewhere.
 */
struct util_fast_udiv_info
util_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS);

/* Below are possible options for dividing by a uniform in a shader where
 * the divisor is constant but not known at compile time.
 */

134 static inline uint32_t
135 util_fast_udiv32(uint32_t n
, struct util_fast_udiv_info info
)
137 n
= n
>> info
.pre_shift
;
138 /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps
139 * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD.
141 * If you have unsigned 64-bit MAD with 32-bit inputs, you can do:
142 * increment = increment ? multiplier : 0; // on the CPU
143 * (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD
145 n
= (((uint64_t)n
+ info
.increment
) * info
.multiplier
) >> 32;
146 n
= n
>> info
.post_shift
;
150 /* A little more efficient version if n != UINT_MAX, i.e. no unsigned
151 * wraparound in the computation.
153 static inline uint32_t
154 util_fast_udiv32_nuw(uint32_t n
, struct util_fast_udiv_info info
)
156 assert(n
!= UINT32_MAX
);
157 n
= n
>> info
.pre_shift
;
158 n
= n
+ info
.increment
;
159 n
= ((uint64_t)n
* info
.multiplier
) >> 32;
160 n
= n
>> info
.post_shift
;
164 /* Even faster version but both operands must be 31-bit unsigned integers
165 * and the divisor must be greater than 1.
167 * info must be computed with num_bits == 31.
169 static inline uint32_t
170 util_fast_udiv32_u31_d_not_one(uint32_t n
, struct util_fast_udiv_info info
)
172 assert(info
.pre_shift
== 0);
173 assert(info
.increment
== 0);
174 n
= ((uint64_t)n
* info
.multiplier
) >> 32;
175 n
= n
>> info
.post_shift
;
#endif /* FAST_IDIV_BY_CONST_H */