openpower/sv/bitmanip/divmnu64.c

   1 /* original source code from Hackers-Delight
   2    https://github.com/hcs0/Hackers-Delight
   3 */
   4 /* This divides an n-word dividend by an m-word divisor, giving an
   5 n-m+1-word quotient and m-word remainder. The bignums are in arrays of
   6 words. Here a "word" is 32 bits. This routine is designed for a 64-bit
   7 machine which has a 64/64 division instruction. */
   8
   9 #include <stdio.h>
  10 #include <stdlib.h>     //To define "exit", req'd by XLC.
  11 #include <stdbool.h>
  12 #include <stdint.h>
  13
  14 #define max(x, y) ((x) > (y) ? (x) : (y))
  15
  16 int nlz(unsigned x) {
  17    int n;
  18
  19    if (x == 0) return(32);
  20    n = 0;
  21    if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
  22    if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
  23    if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
  24    if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
  25    if (x <= 0x7FFFFFFF) {n = n + 1;}
  26    return n;
  27 }
  28
  29 void dumpit(char *msg, int n, unsigned v[]) {
  30    int i;
  31    printf("%s", msg);
  32    for (i = n-1; i >= 0; i--) printf(" %08x", v[i]);
  33    printf("\n");
  34 }
  35
  36 /* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
  37 (The sequence is in little-endian order).
  38
  39 This is a fairly precise implementation of Knuth's Algorithm D, for a
  40 binary computer with base b = 2**32. The caller supplies:
  41    1. Space q for the quotient, m - n + 1 words (at least one).
  42    2. Space r for the remainder (optional), n words.
  43    3. The dividend u, m words, m >= 1.
  44    4. The divisor v, n words, n >= 2.
  45 The most significant digit of the divisor, v[n-1], must be nonzero.  The
  46 dividend u may have leading zeros; this just makes the algorithm take
  47 longer and makes the quotient contain more leading zeros.  A value of
  48 NULL may be given for the address of the remainder to signify that the
  49 caller does not want the remainder.
  50    The program does not alter the input parameters u and v.
  51    The quotient and remainder returned may have leading zeros.  The
  52 function itself returns a value of 0 for success and 1 for invalid
  53 parameters (e.g., division by 0).
  54    For now, we must have m >= n.  Knuth's Algorithm D also requires
  55 that the dividend be at least as long as the divisor.  (In his terms,
  56 m >= 0 (unstated).  Therefore m+n >= n.) */
  57
  58 int divmnu(unsigned q[], unsigned r[],
  59      const unsigned u[], const unsigned v[],
  60      int m, int n) {
  61
  62    const unsigned long long b = 4294967296LL; // Number base (2**32).
  63    unsigned *un, *vn;                         // Normalized form of u, v.
  64    unsigned long long qhat;                   // Estimated quotient digit.
  65    unsigned long long rhat;                   // A remainder.
  66    unsigned long long p;                      // Product of two digits.
  67    long long t, k;
  68    int s, i, j;
  69
  70    if (m < n || n <= 0 || v[n-1] == 0)
  71       return 1;                         // Return if invalid param.
  72
  73    if (n == 1) {                        // Take care of
  74       k = 0;                            // the case of a
  75       for (j = m - 1; j >= 0; j--) {    // single-digit
  76          q[j] = (k*b + u[j])/v[0];      // divisor here.
  77          k = (k*b + u[j]) - q[j]*v[0];
  78       }
  79       if (r != NULL) r[0] = k;
  80       return 0;
  81    }
  82
  83    /* Normalize by shifting v left just enough so that its high-order
  84    bit is on, and shift u left the same amount. We may have to append a
  85    high-order digit on the dividend; we do that unconditionally. */
  86
  87    s = nlz(v[n-1]);             // 0 <= s <= 31.
  88    vn = (unsigned *)alloca(4*n);
  89    for (i = n - 1; i > 0; i--)
  90       vn[i] = (v[i] << s) | ((unsigned long long)v[i-1] >> (32-s));
  91    vn[0] = v[0] << s;
  92
  93    un = (unsigned *)alloca(4*(m + 1));
  94    un[m] = (unsigned long long)u[m-1] >> (32-s);
  95    for (i = m - 1; i > 0; i--)
  96       un[i] = (u[i] << s) | ((unsigned long long)u[i-1] >> (32-s));
  97    un[0] = u[0] << s;
  98
  99    for (j = m - n; j >= 0; j--) {       // Main loop.
 100       // Compute estimate qhat of q[j].
 101       qhat = (un[j+n]*b + un[j+n-1])/vn[n-1];
 102       rhat = (un[j+n]*b + un[j+n-1]) - qhat*vn[n-1];
 103 again:
 104       if (qhat >= b || qhat*vn[n-2] > b*rhat + un[j+n-2])
 105       { qhat = qhat - 1;
 106         rhat = rhat + vn[n-1];
 107         if (rhat < b) goto again;
 108       }
 109 #define MUL_RSUB_CARRY_2_STAGE2
 110 #ifdef ORIGINAL
 111       // Multiply and subtract.
 112       k = 0;
 113       for (i = 0; i < n; i++) {
 114          p = qhat*vn[i];
 115          t = un[i+j] - k - (p & 0xFFFFFFFFLL);
 116          un[i+j] = t;
 117          k = (p >> 32) - (t >> 32);
 118       }
 119       t = un[j+n] - k;
 120       un[j+n] = t;
 121       bool need_fixup = t < 0;
 122 #elif defined(SUB_MUL_BORROW)
 123       (void)p; // shut up unused variable warning
 124
 125       // Multiply and subtract.
 126       uint32_t borrow = 0;
 127       for(int i = 0; i <= n; i++) {
 128          uint32_t vn_i = i < n ? vn[i] : 0;
 129          uint64_t value = un[i + j] - (uint64_t)qhat * vn_i - borrow;
 130          borrow = -(uint32_t)(value >> 32);
 131          un[i + j] = (uint32_t)value;
 132       }
 133       bool need_fixup = borrow != 0;
 134 #elif defined(MUL_RSUB_CARRY)
 135       (void)p; // shut up unused variable warning
 136
 137       // Multiply and subtract.
 138       uint32_t carry = 1;
 139       for(int i = 0; i <= n; i++) {
 140          uint32_t vn_i = i < n ? vn[i] : 0;
 141          uint64_t result = un[i + j] + ~((uint64_t)qhat * vn_i) + carry;
 142          uint32_t result_high = result >> 32;
 143          if(carry <= 1)
 144             result_high++;
 145          carry = result_high;
 146          un[i + j] = (uint32_t)result;
 147       }
 148       bool need_fixup = carry != 1;
 149 #elif defined(SUB_MUL_BORROW_2_STAGE)
 150       (void)p; // shut up unused variable warning
 151
 152       // Multiply and subtract.
 153       uint32_t borrow = 0;
 154       uint32_t phi[2000]; // plenty space
 155       uint32_t plo[2000]; // plenty space
 156       // first, perform mul-and-sub and store in split hi-lo
 157       // this shows the vectorised sv.msubx which stores 128-bit in
 158       // two 64-bit registers
 159       for(int i = 0; i <= n; i++) {
 160          uint32_t vn_i = i < n ? vn[i] : 0;
 161          uint64_t value = un[i + j] - (uint64_t)qhat * vn_i;
 162          plo[i] = value & 0xffffffffLL;
 163          phi[i] = value >> 32;
 164       }
 165       // second, reconstruct the 64-bit result, subtract borrow,
 166       // store top-half (-ve) in new borrow and store low-half as answer
 167       // this is the new (odd) instruction
 168       for(int i = 0; i <= n; i++) {
 169          uint64_t value = (((uint64_t)phi[i]<<32) | plo[i]) - borrow;
 170          borrow = ~(value >> 32)+1; // -(uint32_t)(value >> 32);
 171          un[i + j] = (uint32_t)value;
 172       }
 173       bool need_fixup = borrow != 0;
 174 #elif defined(MUL_RSUB_CARRY_2_STAGE)
 175       (void)p; // shut up unused variable warning
 176
 177       // Multiply and subtract.
 178       uint32_t carry = 1;
 179       uint32_t phi[2000]; // plenty space
 180       uint32_t plo[2000]; // plenty space
 181       for(int i = 0; i <= n; i++) {
 182          uint32_t vn_i = i < n ? vn[i] : 0;
 183          uint64_t value = un[i + j] + ~((uint64_t)qhat * vn_i);
 184          plo[i] = value & 0xffffffffLL;
 185          phi[i] = value >> 32;
 186       }
 187       for(int i = 0; i <= n; i++) {
 188          uint64_t result = (((uint64_t)phi[i]<<32) | plo[i]) + carry;
 189          uint32_t result_high = result >> 32;
 190          if(carry <= 1)
 191             result_high++;
 192          carry = result_high;
 193          un[i + j] = (uint32_t)result;
 194       }
 195       bool need_fixup = carry != 1;
 196 #elif defined(MUL_RSUB_CARRY_2_STAGE1)
 197       (void)p; // shut up unused variable warning
 198
 199       // Multiply and subtract.
 200       uint32_t carry = 1;
 201       uint32_t phi[2000]; // plenty space
 202       uint32_t plo[2000]; // plenty space
 203       // same mul-and-sub as SUB_MUL_BORROW but not the same
 204       // mul-and-sub-minus-one as MUL_RSUB_CARRY
 205       for(int i = 0; i <= n; i++) {
 206          uint32_t vn_i = i < n ? vn[i] : 0;
 207          uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
 208          plo[i] = value & 0xffffffffLL;
 209          phi[i] = value >> 32;
 210       }
 211       // compensate for the +1 that was added by mul-and-sub by subtracting
 212       // it here (as ~(0))
 213       for(int i = 0; i <= n; i++) {
 214          uint64_t result = (((uint64_t)phi[i]<<32) | plo[i]) + carry+
 215                            ~(0); // a way to express "-1"
 216          uint32_t result_high = result >> 32;
 217          if(carry <= 1)
 218             result_high++;
 219          carry = result_high;
 220          un[i + j] = (uint32_t)result;
 221       }
 222       bool need_fixup = carry != 1;
 223 #elif defined(MUL_RSUB_CARRY_2_STAGE2)
 224       (void)p; // shut up unused variable warning
 225
 226       // Multiply and subtract.
 227       uint32_t carry = 0;
 228       uint32_t phi[2000]; // plenty space
 229       uint32_t plo[2000]; // plenty space
 230       // same mul-and-sub as SUB_MUL_BORROW but not the same
 231       // mul-and-sub-minus-one as MUL_RSUB_CARRY
 232       for(int i = 0; i <= n; i++) {
 233          uint32_t vn_i = i < n ? vn[i] : 0;
 234          uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
 235          plo[i] = value & 0xffffffffLL;
 236          phi[i] = value >> 32;
 237       }
 238       // NOW it starts to make sense. when no carry this time, next
 239       // carry as-is. rlse next carry reduces by one.
 240       // it here (as ~(0))
 241       for(int i = 0; i <= n; i++) {
 242          uint64_t result = (((uint64_t)phi[i]<<32) | plo[i]) + carry;
 243          uint32_t result_high = result >> 32;
 244          if(carry == 0)
 245             carry = result_high;
 246          else
 247             carry = result_high-1;
 248          un[i + j] = (uint32_t)result;
 249       }
 250       bool need_fixup = carry != 0;
 251 #else
 252 #error need to choose one of the algorithm options; e.g. -DORIGINAL
 253 #endif
 254
 255       q[j] = qhat;              // Store quotient digit.
 256       if (need_fixup) {         // If we subtracted too
 257          q[j] = q[j] - 1;       // much, add back.
 258          k = 0;
 259          for (i = 0; i < n; i++) {
 260             t = (unsigned long long)un[i+j] + vn[i] + k;
 261             un[i+j] = t;
 262             k = t >> 32;
 263          }
 264          un[j+n] = un[j+n] + k;
 265       }
 266    } // End j.
 267    // If the caller wants the remainder, unnormalize
 268    // it and pass it back.
 269    if (r != NULL) {
 270       for (i = 0; i < n-1; i++)
 271          r[i] = (un[i] >> s) | ((unsigned long long)un[i+1] << (32-s));
 272       r[n-1] = un[n-1] >> s;
 273    }
 274    return 0;
 275 }
 276
 277 int errors;
 278
 279 void check(unsigned q[], unsigned r[],
 280            unsigned u[], unsigned v[],
 281            int m, int n,
 282            unsigned cq[], unsigned cr[]) {
 283    int i, szq;
 284
 285    szq = max(m - n + 1, 1);
 286    for (i = 0; i < szq; i++) {
 287       if (q[i] != cq[i]) {
 288          errors = errors + 1;
 289          dumpit("Error, dividend u =", m, u);
 290          dumpit("       divisor  v =", n, v);
 291          dumpit("For quotient,  got:", m-n+1, q);
 292          dumpit("        Should get:", m-n+1, cq);
 293          return;
 294       }
 295    }
 296    for (i = 0; i < n; i++) {
 297       if (r[i] != cr[i]) {
 298          errors = errors + 1;
 299          dumpit("Error, dividend u =", m, u);
 300          dumpit("       divisor  v =", n, v);
 301          dumpit("For remainder, got:", n, r);
 302          dumpit("        Should get:", n, cr);
 303          return;
 304       }
 305    }
 306    return;
 307 }
 308
 309 int main() {
 310    static unsigned test[] = {
 311    // m, n, u...,          v...,          cq...,  cr....
 312       1, 1, 3,             0,             1,      1,            // Error, divide by 0.
 313       1, 2, 7,             1,3,           0,      7,0,          // Error, n > m.
 314       2, 2, 0,0,           1,0,           0,      0,0,          // Error, incorrect remainder cr.
 315       1, 1, 3,             2,             1,      1,
 316       1, 1, 3,             3,             1,      0,
 317       1, 1, 3,             4,             0,      3,
 318       1, 1, 0,             0xffffffff,    0,      0,
 319       1, 1, 0xffffffff,    1,             0xffffffff, 0,
 320       1, 1, 0xffffffff,    0xffffffff,    1,      0,
 321       1, 1, 0xffffffff,    3,             0x55555555, 0,
 322       2, 1, 0xffffffff,0xffffffff, 1,     0xffffffff,0xffffffff, 0,
 323       2, 1, 0xffffffff,0xffffffff, 0xffffffff,        1,1,    0,
 324       2, 1, 0xffffffff,0xfffffffe, 0xffffffff,        0xffffffff,0, 0xfffffffe,
 325       2, 1, 0x00005678,0x00001234, 0x00009abc,        0x1e1dba76,0, 0x6bd0,
 326       2, 2, 0,0,           0,1,           0,      0,0,
 327       2, 2, 0,7,           0,3,           2,      0,1,
 328       2, 2, 5,7,           0,3,           2,      5,1,
 329       2, 2, 0,6,           0,2,           3,      0,0,
 330       1, 1, 0x80000000,  0x40000001, 0x00000001, 0x3fffffff,
 331       2, 1, 0x00000000,0x80000000, 0x40000001, 0xfffffff8,0x00000001, 0x00000008,
 332       2, 2, 0x00000000,0x80000000, 0x00000001,0x40000000, 0x00000001, 0xffffffff,0x3fffffff,
 333       2, 2, 0x0000789a,0x0000bcde, 0x0000789a,0x0000bcde,          1,          0,0,
 334       2, 2, 0x0000789b,0x0000bcde, 0x0000789a,0x0000bcde,          1,          1,0,
 335       2, 2, 0x00007899,0x0000bcde, 0x0000789a,0x0000bcde,          0, 0x00007899,0x0000bcde,
 336       2, 2, 0x0000ffff,0x0000ffff, 0x0000ffff,0x0000ffff,          1,          0,0,
 337       2, 2, 0x0000ffff,0x0000ffff, 0x00000000,0x00000001, 0x0000ffff, 0x0000ffff,0,
 338       3, 2, 0x000089ab,0x00004567,0x00000123, 0x00000000,0x00000001,   0x00004567,0x00000123, 0x000089ab,0,
 339       3, 2, 0x00000000,0x0000fffe,0x00008000, 0x0000ffff,0x00008000,   0xffffffff,0x00000000, 0x0000ffff,0x00007fff, // Shows that first qhat can = b + 1.
 340       3, 3, 0x00000003,0x00000000,0x80000000, 0x00000001,0x00000000,0x20000000,   0x00000003, 0,0,0x20000000, // Adding back step req'd.
 341       3, 3, 0x00000003,0x00000000,0x00008000, 0x00000001,0x00000000,0x00002000,   0x00000003, 0,0,0x00002000, // Adding back step req'd.
 342       4, 3, 0,0,0x00008000,0x00007fff, 1,0,0x00008000,   0xfffe0000,0, 0x00020000,0xffffffff,0x00007fff,  // Add back req'd.
 343       4, 3, 0,0x0000fffe,0,0x00008000, 0x0000ffff,0,0x00008000, 0xffffffff,0, 0x0000ffff,0xffffffff,0x00007fff,  // Shows that mult-sub quantity cannot be treated as signed.
 344       4, 3, 0,0xfffffffe,0,0x80000000, 0x0000ffff,0,0x80000000, 0x00000000,1, 0x00000000,0xfffeffff,0x00000000,  // Shows that mult-sub quantity cannot be treated as signed.
 345       4, 3, 0,0xfffffffe,0,0x80000000, 0xffffffff,0,0x80000000, 0xffffffff,0, 0xffffffff,0xffffffff,0x7fffffff,  // Shows that mult-sub quantity cannot be treated as signed.
 346    };
 347    int i, n, m, ncases, f;
 348    unsigned q[10], r[10];
 349    unsigned *u, *v, *cq, *cr;
 350
 351    printf("divmnu:\n");
 352    i = 0;
 353    ncases = 0;
 354    while (i < sizeof(test)/4) {
 355       m = test[i];
 356       n = test[i+1];
 357       u = &test[i+2];
 358       v = &test[i+2+m];
 359       cq = &test[i+2+m+n];
 360       cr = &test[i+2+m+n+max(m-n+1, 1)];
 361
 362       f = divmnu(q, r, u, v, m, n);
 363       if (f) {
 364          dumpit("Error return code for dividend u =", m, u);
 365          dumpit("                      divisor  v =", n, v);
 366          errors = errors + 1;
 367       }
 368       else
 369          check(q, r, u, v, m, n, cq, cr);
 370       i = i + 2 + m + n + max(m-n+1, 1) + n;
 371       ncases = ncases + 1;
 372    }
 373
 374    printf("%d errors out of %d cases; there should be 3.\n", errors, ncases);
 375    return 0;
 376 }