src/util/fast_idiv_by_const.h

   1 /*
   2  * Copyright © 2018 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #ifndef FAST_IDIV_BY_CONST_H
  25 #define FAST_IDIV_BY_CONST_H
  26
  27 /* Imported from:
  28  *   https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
  29  */
  30
  31 #include <inttypes.h>
  32 #include <limits.h>
  33 #include <assert.h>
  34
  35 #ifdef __cplusplus
  36 extern "C" {
  37 #endif
  38
  39 /* Computes "magic info" for performing signed division by a fixed integer D.
  40  * The type 'sint_t' is assumed to be defined as a signed integer type large
  41  * enough to hold both the dividend and the divisor.
  42  * Here >> is arithmetic (signed) shift, and >>> is logical shift.
  43  *
  44  * To emit code for n/d, rounding towards zero, use the following sequence:
  45  *
  46  *   m = compute_signed_magic_info(D)
  47  *   emit("result = (m.multiplier * n) >> SINT_BITS");
  48  *   if d > 0 and m.multiplier < 0: emit("result += n")
  49  *   if d < 0 and m.multiplier > 0: emit("result -= n")
  50  *   if m.post_shift > 0: emit("result >>= m.shift")
  51  *   emit("result += (result < 0)")
  52  *
  53  * The shifts by SINT_BITS may be "free" if the high half of the full multiply
  54  * is put in a separate register.
  55  *
  56  * The final add can of course be implemented via the sign bit, e.g.
  57  *    result += (result >>> (SINT_BITS - 1))
  58  * or
  59  *    result -= (result >> (SINT_BITS - 1))
  60  *
  61  * This code is heavily indebted to Hacker's Delight by Henry Warren.
  62  * See http://www.hackersdelight.org/HDcode/magic.c.txt
  63  * Used with permission from http://www.hackersdelight.org/permissions.htm
  64  */
  65
  66 struct util_fast_sdiv_info {
  67    int64_t multiplier; /* the "magic number" multiplier */
  68    unsigned shift; /* shift for the dividend after multiplying */
  69 };
  70
  71 struct util_fast_sdiv_info
  72 util_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS);
  73
  74 /* Computes "magic info" for performing unsigned division by a fixed positive
  75  * integer D.  UINT_BITS is the bit size at which the final "magic"
  76  * calculation will be performed; it is assumed to be large enough to hold
  77  * both the dividand and the divisor.  num_bits can be set appropriately if n
  78  * is known to be smaller than calc_bits; if this is not known then UINT_BITS
  79  * for num_bits.
  80  *
  81  * Assume we have a hardware register of width UINT_BITS, a known constant D
  82  * which is not zero and not a power of 2, and a variable n of width num_bits
  83  * (which may be up to UINT_BITS). To emit code for n/d, use one of the two
  84  * following sequences (here >>> refers to a logical bitshift):
  85  *
  86  *   m = compute_unsigned_magic_info(D, num_bits)
  87  *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
  88  *   if m.increment: emit("n = saturated_increment(n)")
  89  *   emit("result = (m.multiplier * n) >>> UINT_BITS")
  90  *   if m.post_shift > 0: emit("result >>>= m.post_shift")
  91  *
  92  * or
  93  *
  94  *   m = compute_unsigned_magic_info(D, num_bits)
  95  *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
  96  *   emit("result = m.multiplier * n")
  97  *   if m.increment: emit("result = result + m.multiplier")
  98  *   emit("result >>>= UINT_BITS")
  99  *   if m.post_shift > 0: emit("result >>>= m.post_shift")
 100  *
 101  * This second version works even if D is 1.  The shifts by UINT_BITS may be
 102  * "free" if the high half of the full multiply is put in a separate register.
 103  *
 104  * saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
 105  *   if n == (1 << UINT_BITS)-1: result = n
 106  *   else: result = n+1
 107  * A common way to implement this is with the carry bit. For example, on x86:
 108  *   add 1
 109  *   sbb 0
 110  *
 111  * Some invariants:
 112  *   1: At least one of pre_shift and increment is zero
 113  *   2: multiplier is never zero
 114  *
 115  * This code incorporates the "round down" optimization per ridiculous_fish.
 116  */
 117
 118 struct util_fast_udiv_info {
 119    uint64_t multiplier; /* the "magic number" multiplier */
 120    unsigned pre_shift; /* shift for the dividend before multiplying */
 121    unsigned post_shift; /* shift for the dividend after multiplying */
 122    int increment; /* 0 or 1; if set then increment the numerator, using one of
 123                      the two strategies */
 124 };
 125
 126 struct util_fast_udiv_info
 127 util_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS);
 128
 129 /* Below are possible options for dividing by a uniform in a shader where
 130  * the divisor is constant but not known at compile time.
 131  */
 132
 133 /* Full version. */
 134 static inline uint32_t
 135 util_fast_udiv32(uint32_t n, struct util_fast_udiv_info info)
 136 {
 137    n = n >> info.pre_shift;
 138    /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps
 139     * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD.
 140     *
 141     * If you have unsigned 64-bit MAD with 32-bit inputs, you can do:
 142     *    increment = increment ? multiplier : 0; // on the CPU
 143     *    (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD
 144     */
 145    n = (((uint64_t)n + info.increment) * info.multiplier) >> 32;
 146    n = n >> info.post_shift;
 147    return n;
 148 }
 149
 150 /* A little more efficient version if n != UINT_MAX, i.e. no unsigned
 151  * wraparound in the computation.
 152  */
 153 static inline uint32_t
 154 util_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info)
 155 {
 156    assert(n != UINT32_MAX);
 157    n = n >> info.pre_shift;
 158    n = n + info.increment;
 159    n = ((uint64_t)n * info.multiplier) >> 32;
 160    n = n >> info.post_shift;
 161    return n;
 162 }
 163
 164 /* Even faster version but both operands must be 31-bit unsigned integers
 165  * and the divisor must be greater than 1.
 166  *
 167  * info must be computed with num_bits == 31.
 168  */
 169 static inline uint32_t
 170 util_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info)
 171 {
 172    assert(info.pre_shift == 0);
 173    assert(info.increment == 0);
 174    n = ((uint64_t)n * info.multiplier) >> 32;
 175    n = n >> info.post_shift;
 176    return n;
 177 }
 178
 179 #ifdef __cplusplus
 180 } /* extern C */
 181 #endif
 182
 183 #endif /* FAST_IDIV_BY_CONST_H */