openpower/sv/bitmanip/divmnu64.c

   1 /* original source code from Hackers-Delight
   2    https://github.com/hcs0/Hackers-Delight
   3 */
   4 /* This divides an n-word dividend by an m-word divisor, giving an
   5 n-m+1-word quotient and m-word remainder. The bignums are in arrays of
   6 words. Here a "word" is 32 bits. This routine is designed for a 64-bit
   7 machine which has a 64/64 division instruction. */
   8
   9 #include <stdio.h>
  10 #include <stdlib.h>     //To define "exit", req'd by XLC.
  11 #include <stdbool.h>
  12 #include <stdint.h>
  13
  14 #define max(x, y) ((x) > (y) ? (x) : (y))
  15
  16 int nlz(unsigned x) {
  17    int n;
  18
  19    if (x == 0) return(32);
  20    n = 0;
  21    if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
  22    if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
  23    if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
  24    if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
  25    if (x <= 0x7FFFFFFF) {n = n + 1;}
  26    return n;
  27 }
  28
  29 void dumpit(char *msg, int n, unsigned v[]) {
  30    int i;
  31    printf("%s", msg);
  32    for (i = n-1; i >= 0; i--) printf(" %08x", v[i]);
  33    printf("\n");
  34 }
  35
  36 /* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
  37 (The sequence is in little-endian order).
  38
  39 This is a fairly precise implementation of Knuth's Algorithm D, for a
  40 binary computer with base b = 2**32. The caller supplies:
  41    1. Space q for the quotient, m - n + 1 words (at least one).
  42    2. Space r for the remainder (optional), n words.
  43    3. The dividend u, m words, m >= 1.
  44    4. The divisor v, n words, n >= 2.
  45 The most significant digit of the divisor, v[n-1], must be nonzero.  The
  46 dividend u may have leading zeros; this just makes the algorithm take
  47 longer and makes the quotient contain more leading zeros.  A value of
  48 NULL may be given for the address of the remainder to signify that the
  49 caller does not want the remainder.
  50    The program does not alter the input parameters u and v.
  51    The quotient and remainder returned may have leading zeros.  The
  52 function itself returns a value of 0 for success and 1 for invalid
  53 parameters (e.g., division by 0).
  54    For now, we must have m >= n.  Knuth's Algorithm D also requires
  55 that the dividend be at least as long as the divisor.  (In his terms,
  56 m >= 0 (unstated).  Therefore m+n >= n.) */
  57
  58 int divmnu(unsigned q[], unsigned r[],
  59      const unsigned u[], const unsigned v[],
  60      int m, int n) {
  61
  62    const unsigned long long b = 4294967296LL; // Number base (2**32).
  63    unsigned *un, *vn;                         // Normalized form of u, v.
  64    unsigned long long qhat;                   // Estimated quotient digit.
  65    unsigned long long rhat;                   // A remainder.
  66    unsigned long long p;                      // Product of two digits.
  67    long long t, k;
  68    int s, i, j;
  69
  70    if (m < n || n <= 0 || v[n-1] == 0)
  71       return 1;                         // Return if invalid param.
  72
  73    if (n == 1) {                        // Take care of
  74       k = 0;                            // the case of a
  75       for (j = m - 1; j >= 0; j--) {    // single-digit
  76          q[j] = (k*b + u[j])/v[0];      // divisor here.
  77          k = (k*b + u[j]) - q[j]*v[0];
  78       }
  79       if (r != NULL) r[0] = k;
  80       return 0;
  81    }
  82
  83    /* Normalize by shifting v left just enough so that its high-order
  84    bit is on, and shift u left the same amount. We may have to append a
  85    high-order digit on the dividend; we do that unconditionally. */
  86
  87    s = nlz(v[n-1]);             // 0 <= s <= 31.
  88    vn = (unsigned *)alloca(4*n);
  89    for (i = n - 1; i > 0; i--)
  90       vn[i] = (v[i] << s) | ((unsigned long long)v[i-1] >> (32-s));
  91    vn[0] = v[0] << s;
  92
  93    un = (unsigned *)alloca(4*(m + 1));
  94    un[m] = (unsigned long long)u[m-1] >> (32-s);
  95    for (i = m - 1; i > 0; i--)
  96       un[i] = (u[i] << s) | ((unsigned long long)u[i-1] >> (32-s));
  97    un[0] = u[0] << s;
  98
  99    for (j = m - n; j >= 0; j--) {       // Main loop.
 100       // Compute estimate qhat of q[j].
 101       qhat = (un[j+n]*b + un[j+n-1])/vn[n-1];
 102       rhat = (un[j+n]*b + un[j+n-1]) - qhat*vn[n-1];
 103 again:
 104       if (qhat >= b || qhat*vn[n-2] > b*rhat + un[j+n-2])
 105       { qhat = qhat - 1;
 106         rhat = rhat + vn[n-1];
 107         if (rhat < b) goto again;
 108       }
 109
 110 #define SUB_MUL_BORROW
 111 #ifdef ORIGINAL
 112       // Multiply and subtract.
 113       k = 0;
 114       for (i = 0; i < n; i++) {
 115          p = qhat*vn[i];
 116          t = un[i+j] - k - (p & 0xFFFFFFFFLL);
 117          un[i+j] = t;
 118          k = (p >> 32) - (t >> 32);
 119       }
 120       t = un[j+n] - k;
 121       un[j+n] = t;
 122       bool need_fixup = t < 0;
 123 #elif defined(SUB_MUL_BORROW)
 124       (void)p; // shut up unused variable warning
 125
 126       // Multiply and subtract.
 127       uint32_t borrow = 0;
 128       uint32_t phi[2000]; // plenty space
 129       uint32_t plo[2000]; // plenty space
 130       // first, perform mul-and-sub and store in split hi-lo
 131       // this shows the vectorised sv.msubx which stores 128-bit in
 132       // two 64-bit registers
 133       for(int i = 0; i <= n; i++) {
 134          uint32_t vn_i = i < n ? vn[i] : 0;
 135          uint64_t value = un[i + j] - (uint64_t)qhat * vn_i;
 136          plo[i] = value & 0xffffffffLL;
 137          phi[i] = value >> 32;
 138       }
 139       // second, reconstruct the 64-bit result, subtract borrow,
 140       // store top-half (-ve) in new borrow and store low-half as answer
 141       // this is the new (odd) instruction
 142       for(int i = 0; i <= n; i++) {
 143          uint64_t value = (((uint64_t)phi[i]<<32) | plo[i]) - borrow;
 144          borrow = ~(value >> 32)+1; // -(uint32_t)(value >> 32);
 145          un[i + j] = (uint32_t)value;
 146       }
 147       bool need_fixup = borrow != 0;
 148 #elif defined(MUL_RSUB_CARRY)
 149       (void)p; // shut up unused variable warning
 150
 151       // Multiply and subtract.
 152       uint32_t carry = 1;
 153       uint32_t phi[2000]; // plenty space
 154       uint32_t plo[2000]; // plenty space
 155       for(int i = 0; i <= n; i++) {
 156          uint32_t vn_i = i < n ? vn[i] : 0;
 157          uint64_t value = un[i + j] + ~((uint64_t)qhat * vn_i);
 158          plo[i] = value & 0xffffffffLL;
 159          phi[i] = value >> 32;
 160       }
 161       for(int i = 0; i <= n; i++) {
 162          uint64_t result = (((uint64_t)phi[i]<<32) | plo[i]) + carry;
 163          uint32_t result_high = result >> 32;
 164          if(carry <= 1)
 165             result_high++;
 166          carry = result_high;
 167          un[i + j] = (uint32_t)result;
 168       }
 169       bool need_fixup = carry != 1;
 170 #elif defined(MUL_RSUB_CARRY1)
 171       (void)p; // shut up unused variable warning
 172
 173       // Multiply and subtract.
 174       uint32_t carry = 1;
 175       uint32_t phi[2000]; // plenty space
 176       uint32_t plo[2000]; // plenty space
 177       // same mul-and-sub as SUB_MUL_BORROW but not the same
 178       // mul-and-sub-minus-one as MUL_RSUB_CARRY
 179       for(int i = 0; i <= n; i++) {
 180          uint32_t vn_i = i < n ? vn[i] : 0;
 181          uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
 182          plo[i] = value & 0xffffffffLL;
 183          phi[i] = value >> 32;
 184       }
 185       // compensate for the +1 that was added by mul-and-sub by subtracting
 186       // it here (as ~(0))
 187       for(int i = 0; i <= n; i++) {
 188          uint64_t result = (((uint64_t)phi[i]<<32) | plo[i]) + carry+
 189                            ~(0); // a way to express "-1"
 190          uint32_t result_high = result >> 32;
 191          if(carry <= 1)
 192             result_high++;
 193          carry = result_high;
 194          un[i + j] = (uint32_t)result;
 195       }
 196       bool need_fixup = carry != 1;
 197 #else
 198 #error need to define one of ORIGINAL, SUB_MUL_BORROW, or MUL_RSUB_CARRY
 199 #endif
 200
 201       q[j] = qhat;              // Store quotient digit.
 202       if (need_fixup) {         // If we subtracted too
 203          q[j] = q[j] - 1;       // much, add back.
 204          k = 0;
 205          for (i = 0; i < n; i++) {
 206             t = (unsigned long long)un[i+j] + vn[i] + k;
 207             un[i+j] = t;
 208             k = t >> 32;
 209          }
 210          un[j+n] = un[j+n] + k;
 211       }
 212    } // End j.
 213    // If the caller wants the remainder, unnormalize
 214    // it and pass it back.
 215    if (r != NULL) {
 216       for (i = 0; i < n-1; i++)
 217          r[i] = (un[i] >> s) | ((unsigned long long)un[i+1] << (32-s));
 218       r[n-1] = un[n-1] >> s;
 219    }
 220    return 0;
 221 }
 222
 223 int errors;
 224
 225 void check(unsigned q[], unsigned r[],
 226            unsigned u[], unsigned v[],
 227            int m, int n,
 228            unsigned cq[], unsigned cr[]) {
 229    int i, szq;
 230
 231    szq = max(m - n + 1, 1);
 232    for (i = 0; i < szq; i++) {
 233       if (q[i] != cq[i]) {
 234          errors = errors + 1;
 235          dumpit("Error, dividend u =", m, u);
 236          dumpit("       divisor  v =", n, v);
 237          dumpit("For quotient,  got:", m-n+1, q);
 238          dumpit("        Should get:", m-n+1, cq);
 239          return;
 240       }
 241    }
 242    for (i = 0; i < n; i++) {
 243       if (r[i] != cr[i]) {
 244          errors = errors + 1;
 245          dumpit("Error, dividend u =", m, u);
 246          dumpit("       divisor  v =", n, v);
 247          dumpit("For remainder, got:", n, r);
 248          dumpit("        Should get:", n, cr);
 249          return;
 250       }
 251    }
 252    return;
 253 }
 254
 255 int main() {
 256    static unsigned test[] = {
 257    // m, n, u...,          v...,          cq...,  cr....
 258       1, 1, 3,             0,             1,      1,            // Error, divide by 0.
 259       1, 2, 7,             1,3,           0,      7,0,          // Error, n > m.
 260       2, 2, 0,0,           1,0,           0,      0,0,          // Error, incorrect remainder cr.
 261       1, 1, 3,             2,             1,      1,
 262       1, 1, 3,             3,             1,      0,
 263       1, 1, 3,             4,             0,      3,
 264       1, 1, 0,             0xffffffff,    0,      0,
 265       1, 1, 0xffffffff,    1,             0xffffffff, 0,
 266       1, 1, 0xffffffff,    0xffffffff,    1,      0,
 267       1, 1, 0xffffffff,    3,             0x55555555, 0,
 268       2, 1, 0xffffffff,0xffffffff, 1,     0xffffffff,0xffffffff, 0,
 269       2, 1, 0xffffffff,0xffffffff, 0xffffffff,        1,1,    0,
 270       2, 1, 0xffffffff,0xfffffffe, 0xffffffff,        0xffffffff,0, 0xfffffffe,
 271       2, 1, 0x00005678,0x00001234, 0x00009abc,        0x1e1dba76,0, 0x6bd0,
 272       2, 2, 0,0,           0,1,           0,      0,0,
 273       2, 2, 0,7,           0,3,           2,      0,1,
 274       2, 2, 5,7,           0,3,           2,      5,1,
 275       2, 2, 0,6,           0,2,           3,      0,0,
 276       1, 1, 0x80000000,  0x40000001, 0x00000001, 0x3fffffff,
 277       2, 1, 0x00000000,0x80000000, 0x40000001, 0xfffffff8,0x00000001, 0x00000008,
 278       2, 2, 0x00000000,0x80000000, 0x00000001,0x40000000, 0x00000001, 0xffffffff,0x3fffffff,
 279       2, 2, 0x0000789a,0x0000bcde, 0x0000789a,0x0000bcde,          1,          0,0,
 280       2, 2, 0x0000789b,0x0000bcde, 0x0000789a,0x0000bcde,          1,          1,0,
 281       2, 2, 0x00007899,0x0000bcde, 0x0000789a,0x0000bcde,          0, 0x00007899,0x0000bcde,
 282       2, 2, 0x0000ffff,0x0000ffff, 0x0000ffff,0x0000ffff,          1,          0,0,
 283       2, 2, 0x0000ffff,0x0000ffff, 0x00000000,0x00000001, 0x0000ffff, 0x0000ffff,0,
 284       3, 2, 0x000089ab,0x00004567,0x00000123, 0x00000000,0x00000001,   0x00004567,0x00000123, 0x000089ab,0,
 285       3, 2, 0x00000000,0x0000fffe,0x00008000, 0x0000ffff,0x00008000,   0xffffffff,0x00000000, 0x0000ffff,0x00007fff, // Shows that first qhat can = b + 1.
 286       3, 3, 0x00000003,0x00000000,0x80000000, 0x00000001,0x00000000,0x20000000,   0x00000003, 0,0,0x20000000, // Adding back step req'd.
 287       3, 3, 0x00000003,0x00000000,0x00008000, 0x00000001,0x00000000,0x00002000,   0x00000003, 0,0,0x00002000, // Adding back step req'd.
 288       4, 3, 0,0,0x00008000,0x00007fff, 1,0,0x00008000,   0xfffe0000,0, 0x00020000,0xffffffff,0x00007fff,  // Add back req'd.
 289       4, 3, 0,0x0000fffe,0,0x00008000, 0x0000ffff,0,0x00008000, 0xffffffff,0, 0x0000ffff,0xffffffff,0x00007fff,  // Shows that mult-sub quantity cannot be treated as signed.
 290       4, 3, 0,0xfffffffe,0,0x80000000, 0x0000ffff,0,0x80000000, 0x00000000,1, 0x00000000,0xfffeffff,0x00000000,  // Shows that mult-sub quantity cannot be treated as signed.
 291       4, 3, 0,0xfffffffe,0,0x80000000, 0xffffffff,0,0x80000000, 0xffffffff,0, 0xffffffff,0xffffffff,0x7fffffff,  // Shows that mult-sub quantity cannot be treated as signed.
 292    };
 293    int i, n, m, ncases, f;
 294    unsigned q[10], r[10];
 295    unsigned *u, *v, *cq, *cr;
 296
 297    printf("divmnu:\n");
 298    i = 0;
 299    ncases = 0;
 300    while (i < sizeof(test)/4) {
 301       m = test[i];
 302       n = test[i+1];
 303       u = &test[i+2];
 304       v = &test[i+2+m];
 305       cq = &test[i+2+m+n];
 306       cr = &test[i+2+m+n+max(m-n+1, 1)];
 307
 308       f = divmnu(q, r, u, v, m, n);
 309       if (f) {
 310          dumpit("Error return code for dividend u =", m, u);
 311          dumpit("                      divisor  v =", n, v);
 312          errors = errors + 1;
 313       }
 314       else
 315          check(q, r, u, v, m, n, cq, cr);
 316       i = i + 2 + m + n + max(m-n+1, 1) + n;
 317       ncases = ncases + 1;
 318    }
 319
 320    printf("%d errors out of %d cases; there should be 3.\n", errors, ncases);
 321    return 0;
 322 }