openpower/sv/biginteger/divmnu64.c

   1 /* original source code from Hackers-Delight
   2    https://github.com/hcs0/Hackers-Delight
   3 */
   4 /* This divides an n-word dividend by an m-word divisor, giving an
   5 n-m+1-word quotient and m-word remainder. The bignums are in arrays of
   6 words. Here a "word" is 32 bits. This routine is designed for a 64-bit
   7 machine which has a 64/64 division instruction. */
   8
   9 #include <stdbool.h>
  10 #include <stdint.h>
  11 #include <stdio.h>
  12 #include <stdlib.h> //To define "exit", req'd by XLC.
  13
  14 #define max(x, y) ((x) > (y) ? (x) : (y))
  15
  16 int nlz(unsigned x)
  17 {
  18     int n;
  19
  20     if (x == 0)
  21         return (32);
  22     n = 0;
  23     if (x <= 0x0000FFFF)
  24     {
  25         n = n + 16;
  26         x = x << 16;
  27     }
  28     if (x <= 0x00FFFFFF)
  29     {
  30         n = n + 8;
  31         x = x << 8;
  32     }
  33     if (x <= 0x0FFFFFFF)
  34     {
  35         n = n + 4;
  36         x = x << 4;
  37     }
  38     if (x <= 0x3FFFFFFF)
  39     {
  40         n = n + 2;
  41         x = x << 2;
  42     }
  43     if (x <= 0x7FFFFFFF)
  44     {
  45         n = n + 1;
  46     }
  47     return n;
  48 }
  49
  50 void dumpit(char *msg, int n, unsigned v[])
  51 {
  52     int i;
  53     printf("%s", msg);
  54     for (i = n - 1; i >= 0; i--)
  55         printf(" %08x", v[i]);
  56     printf("\n");
  57 }
  58
  59 /* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
  60 (The sequence is in little-endian order).
  61
  62 This is a fairly precise implementation of Knuth's Algorithm D, for a
  63 binary computer with base b = 2**32. The caller supplies:
  64    1. Space q for the quotient, m - n + 1 words (at least one).
  65    2. Space r for the remainder (optional), n words.
  66    3. The dividend u, m words, m >= 1.
  67    4. The divisor v, n words, n >= 2.
  68 The most significant digit of the divisor, v[n-1], must be nonzero.  The
  69 dividend u may have leading zeros; this just makes the algorithm take
  70 longer and makes the quotient contain more leading zeros.  A value of
  71 NULL may be given for the address of the remainder to signify that the
  72 caller does not want the remainder.
  73    The program does not alter the input parameters u and v.
  74    The quotient and remainder returned may have leading zeros.  The
  75 function itself returns a value of 0 for success and 1 for invalid
  76 parameters (e.g., division by 0).
  77    For now, we must have m >= n.  Knuth's Algorithm D also requires
  78 that the dividend be at least as long as the divisor.  (In his terms,
  79 m >= 0 (unstated).  Therefore m+n >= n.) */
  80
  81 int divmnu(unsigned q[], unsigned r[], const unsigned u[], const unsigned v[],
  82            int m, int n)
  83 {
  84
  85     const unsigned long long b = 1LL << 32; // Number base (2**32).
  86     unsigned *un, *vn;                      // Normalized form of u, v.
  87     unsigned long long qhat;                // Estimated quotient digit.
  88     unsigned long long rhat;                // A remainder.
  89     unsigned long long p;                   // Product of two digits.
  90     long long t, k;
  91     int s, i, j;
  92
  93     if (m < n || n <= 0 || v[n - 1] == 0)
  94         return 1; // Return if invalid param.
  95
  96     if (n == 1)
  97     {          // Take care of
  98         k = 0; // the case of a
  99         for (j = m - 1; j >= 0; j--)
 100         {                                 // single-digit
 101             q[j] = (k * b + u[j]) / v[0]; // divisor here.
 102             k = (k * b + u[j]) - q[j] * v[0];
 103         }
 104         if (r != NULL)
 105             r[0] = k;
 106         return 0;
 107     }
 108
 109     /* Normalize by shifting v left just enough so that its high-order
 110     bit is on, and shift u left the same amount. We may have to append a
 111     high-order digit on the dividend; we do that unconditionally. */
 112
 113     s = nlz(v[n - 1]); // 0 <= s <= 31.
 114     vn = (unsigned *)alloca(4 * n);
 115     for (i = n - 1; i > 0; i--)
 116         vn[i] = (v[i] << s) | ((unsigned long long)v[i - 1] >> (32 - s));
 117     vn[0] = v[0] << s;
 118
 119     un = (unsigned *)alloca(4 * (m + 1));
 120     un[m] = (unsigned long long)u[m - 1] >> (32 - s);
 121     for (i = m - 1; i > 0; i--)
 122         un[i] = (u[i] << s) | ((unsigned long long)u[i - 1] >> (32 - s));
 123     un[0] = u[0] << s;
 124
 125     for (j = m - n; j >= 0; j--)
 126     { // Main loop.
 127         // Compute estimate qhat of q[j] from top 2 digits
 128         uint64_t dig2 = ((uint64_t)un[j + n] << 32) | un[j + n - 1];
 129         qhat = dig2 / vn[n - 1];
 130         rhat = dig2 % vn[n - 1];
 131     again:
 132         // use 3rd-from-top digit to obtain better accuracy
 133         if (qhat >= b || qhat * vn[n - 2] > b * rhat + un[j + n - 2])
 134         {
 135             qhat = qhat - 1;
 136             rhat = rhat + vn[n - 1];
 137             if (rhat < b)
 138                 goto again;
 139         }
 140 // don't define here, allowing easily testing all options by passing -D...
 141 // #define MUL_RSUB_CARRY_2_STAGE2
 142 #ifdef ORIGINAL
 143         // Multiply and subtract.
 144         k = 0;
 145         for (i = 0; i < n; i++)
 146         {
 147             p = qhat * vn[i];
 148             t = un[i + j] - k - (p & 0xFFFFFFFFLL);
 149             un[i + j] = t;
 150             k = (p >> 32) - (t >> 32);
 151         }
 152         t = un[j + n] - k;
 153         un[j + n] = t;
 154         bool need_fixup = t < 0;
 155 #elif defined(SUB_MUL_BORROW)
 156         (void)p; // shut up unused variable warning
 157
 158         // Multiply and subtract.
 159         uint32_t borrow = 0;
 160         for (int i = 0; i <= n; i++)
 161         {
 162             uint32_t vn_i = i < n ? vn[i] : 0;
 163             uint64_t value = un[i + j] - (uint64_t)qhat * vn_i - borrow;
 164             borrow = -(uint32_t)(value >> 32);
 165             un[i + j] = (uint32_t)value;
 166         }
 167         bool need_fixup = borrow != 0;
 168 #elif defined(MUL_RSUB_CARRY)
 169         (void)p; // shut up unused variable warning
 170
 171         // Multiply and subtract.
 172         uint32_t carry = 1;
 173         for (int i = 0; i <= n; i++)
 174         {
 175             uint32_t vn_i = i < n ? vn[i] : 0;
 176             uint64_t result = un[i + j] + ~((uint64_t)qhat * vn_i) + carry;
 177             uint32_t result_high = result >> 32;
 178             if (carry <= 1)
 179                 result_high++;
 180             carry = result_high;
 181             un[i + j] = (uint32_t)result;
 182         }
 183         bool need_fixup = carry != 1;
 184 #elif defined(SUB_MUL_BORROW_2_STAGE)
 185         (void)p; // shut up unused variable warning
 186
 187         // Multiply and subtract.
 188         uint32_t borrow = 0;
 189         uint32_t phi[2000]; // plenty space
 190         uint32_t plo[2000]; // plenty space
 191         // first, perform mul-and-sub and store in split hi-lo
 192         // this shows the vectorised sv.msubx which stores 128-bit in
 193         // two 64-bit registers
 194         for (int i = 0; i <= n; i++)
 195         {
 196             uint32_t vn_i = i < n ? vn[i] : 0;
 197             uint64_t value = un[i + j] - (uint64_t)qhat * vn_i;
 198             plo[i] = value & 0xffffffffLL;
 199             phi[i] = value >> 32;
 200         }
 201         // second, reconstruct the 64-bit result, subtract borrow,
 202         // store top-half (-ve) in new borrow and store low-half as answer
 203         // this is the new (odd) instruction
 204         for (int i = 0; i <= n; i++)
 205         {
 206             uint64_t value = (((uint64_t)phi[i] << 32) | plo[i]) - borrow;
 207             borrow = ~(value >> 32) + 1; // -(uint32_t)(value >> 32);
 208             un[i + j] = (uint32_t)value;
 209         }
 210         bool need_fixup = borrow != 0;
 211 #elif defined(MUL_RSUB_CARRY_2_STAGE)
 212         (void)p; // shut up unused variable warning
 213
 214         // Multiply and subtract.
 215         uint32_t carry = 1;
 216         uint32_t phi[2000]; // plenty space
 217         uint32_t plo[2000]; // plenty space
 218         for (int i = 0; i <= n; i++)
 219         {
 220             uint32_t vn_i = i < n ? vn[i] : 0;
 221             uint64_t value = un[i + j] + ~((uint64_t)qhat * vn_i);
 222             plo[i] = value & 0xffffffffLL;
 223             phi[i] = value >> 32;
 224         }
 225         for (int i = 0; i <= n; i++)
 226         {
 227             uint64_t result = (((uint64_t)phi[i] << 32) | plo[i]) + carry;
 228             uint32_t result_high = result >> 32;
 229             if (carry <= 1)
 230                 result_high++;
 231             carry = result_high;
 232             un[i + j] = (uint32_t)result;
 233         }
 234         bool need_fixup = carry != 1;
 235 #elif defined(MUL_RSUB_CARRY_2_STAGE1)
 236         (void)p; // shut up unused variable warning
 237
 238         // Multiply and subtract.
 239         uint32_t carry = 1;
 240         uint32_t phi[2000]; // plenty space
 241         uint32_t plo[2000]; // plenty space
 242         // same mul-and-sub as SUB_MUL_BORROW but not the same
 243         // mul-and-sub-minus-one as MUL_RSUB_CARRY
 244         for (int i = 0; i <= n; i++)
 245         {
 246             uint32_t vn_i = i < n ? vn[i] : 0;
 247             uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
 248             plo[i] = value & 0xffffffffLL;
 249             phi[i] = value >> 32;
 250         }
 251         // compensate for the +1 that was added by mul-and-sub by subtracting
 252         // it here (as ~(0))
 253         for (int i = 0; i <= n; i++)
 254         {
 255             uint64_t result = (((uint64_t)phi[i] << 32) | plo[i]) + carry +
 256                               ~(0); // a way to express "-1"
 257             uint32_t result_high = result >> 32;
 258             if (carry <= 1)
 259                 result_high++;
 260             carry = result_high;
 261             un[i + j] = (uint32_t)result;
 262         }
 263         bool need_fixup = carry != 1;
 264 #elif defined(MUL_RSUB_CARRY_2_STAGE2)
 265         (void)p; // shut up unused variable warning
 266
 267         // Multiply and subtract.
 268         uint32_t carry = 0;
 269         uint32_t phi[2000]; // plenty space
 270         uint32_t plo[2000]; // plenty space
 271         // same mul-and-sub as SUB_MUL_BORROW but not the same
 272         // mul-and-sub-minus-one as MUL_RSUB_CARRY
 273         for (int i = 0; i <= n; i++)
 274         {
 275             uint32_t vn_i = i < n ? vn[i] : 0;
 276             uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
 277             plo[i] = value & 0xffffffffLL;
 278             phi[i] = value >> 32;
 279         }
 280         // NOW it starts to make sense. when no carry this time, next
 281         // carry as-is. rlse next carry reduces by one.
 282         // it here (as ~(0))
 283         for (int i = 0; i <= n; i++)
 284         {
 285             uint64_t result = (((uint64_t)phi[i] << 32) | plo[i]) + carry;
 286             uint32_t result_high = result >> 32;
 287             if (carry == 0)
 288                 carry = result_high;
 289             else
 290                 carry = result_high - 1;
 291             un[i + j] = (uint32_t)result;
 292         }
 293         bool need_fixup = carry != 0;
 294 #elif defined(MADDED_SUBFE)
 295         (void)p; // shut up unused variable warning
 296
 297         // Multiply and subtract.
 298         uint32_t carry = 0;
 299         uint32_t product[n + 1];
 300         // VL = n + 1
 301         // sv.madded product.v, vn.v, qhat.s, carry.s
 302         for (int i = 0; i <= n; i++)
 303         {
 304             uint32_t vn_v = i < n ? vn[i] : 0;
 305             uint64_t value = (uint64_t)vn_v * (uint64_t)qhat + carry;
 306             carry = (uint32_t)(value >> 32);
 307             product[i] = (uint32_t)value;
 308         }
 309         bool ca = true;
 310         uint32_t *un_j = &un[j];
 311         // VL = n + 1
 312         // sv.subfe un_j.v, product.v, un_j.v
 313         for (int i = 0; i <= n; i++)
 314         {
 315             uint64_t value = (uint64_t)~product[i] + (uint64_t)un_j[i] + ca;
 316             ca = value >> 32 != 0;
 317             un_j[i] = value;
 318         }
 319         bool need_fixup = !ca;
 320 #else
 321 #error need to choose one of the algorithm options; e.g. -DORIGINAL
 322 #endif
 323
 324         q[j] = qhat; // Store quotient digit.
 325         if (need_fixup)
 326         {                    // If we subtracted too
 327             q[j] = q[j] - 1; // much, add back.
 328             k = 0;
 329             for (i = 0; i < n; i++)
 330             {
 331                 t = (unsigned long long)un[i + j] + vn[i] + k;
 332                 un[i + j] = t;
 333                 k = t >> 32;
 334             }
 335             un[j + n] = un[j + n] + k;
 336         }
 337     } // End j.
 338     // If the caller wants the remainder, unnormalize
 339     // it and pass it back.
 340     if (r != NULL)
 341     {
 342         for (i = 0; i < n - 1; i++)
 343             r[i] = (un[i] >> s) | ((unsigned long long)un[i + 1] << (32 - s));
 344         r[n - 1] = un[n - 1] >> s;
 345     }
 346     return 0;
 347 }
 348
 349 int errors;
 350
 351 void check(unsigned q[], unsigned r[], unsigned u[], unsigned v[], int m,
 352            int n, unsigned cq[], unsigned cr[])
 353 {
 354     int i, szq;
 355
 356     szq = max(m - n + 1, 1);
 357     for (i = 0; i < szq; i++)
 358     {
 359         if (q[i] != cq[i])
 360         {
 361             errors = errors + 1;
 362             dumpit("Error, dividend u =", m, u);
 363             dumpit("       divisor  v =", n, v);
 364             dumpit("For quotient,  got:", m - n + 1, q);
 365             dumpit("        Should get:", m - n + 1, cq);
 366             return;
 367         }
 368     }
 369     for (i = 0; i < n; i++)
 370     {
 371         if (r[i] != cr[i])
 372         {
 373             errors = errors + 1;
 374             dumpit("Error, dividend u =", m, u);
 375             dumpit("       divisor  v =", n, v);
 376             dumpit("For remainder, got:", n, r);
 377             dumpit("        Should get:", n, cr);
 378             return;
 379         }
 380     }
 381     return;
 382 }
 383
 384 int main()
 385 {
 386     static struct
 387     {
 388         int m, n;
 389         uint32_t u[10], v[10], cq[10], cr[10];
 390         bool error;
 391     } test[] = {
 392         // clang-format off
 393         {.m=1, .n=1, .u={3}, .v={0}, .cq={1}, .cr={1}, .error=true}, // Error, divide by 0.
 394         {.m=1, .n=2, .u={7}, .v={1,3}, .cq={0}, .cr={7,0}, .error=true}, // Error, n > m.
 395         {.m=2, .n=2, .u={0,0}, .v={1,0}, .cq={0}, .cr={0,0}, .error=true}, // Error, incorrect remainder cr.
 396         {.m=1, .n=1, .u={3}, .v={2}, .cq={1}, .cr={1}},
 397         {.m=1, .n=1, .u={3}, .v={3}, .cq={1}, .cr={0}},
 398         {.m=1, .n=1, .u={3}, .v={4}, .cq={0}, .cr={3}},
 399         {.m=1, .n=1, .u={0}, .v={0xffffffff}, .cq={0}, .cr={0}},
 400         {.m=1, .n=1, .u={0xffffffff}, .v={1}, .cq={0xffffffff}, .cr={0}},
 401         {.m=1, .n=1, .u={0xffffffff}, .v={0xffffffff}, .cq={1}, .cr={0}},
 402         {.m=1, .n=1, .u={0xffffffff}, .v={3}, .cq={0x55555555}, .cr={0}},
 403         {.m=2, .n=1, .u={0xffffffff,0xffffffff}, .v={1}, .cq={0xffffffff,0xffffffff}, .cr={0}},
 404         {.m=2, .n=1, .u={0xffffffff,0xffffffff}, .v={0xffffffff}, .cq={1,1}, .cr={0}},
 405         {.m=2, .n=1, .u={0xffffffff,0xfffffffe}, .v={0xffffffff}, .cq={0xffffffff,0}, .cr={0xfffffffe}},
 406         {.m=2, .n=1, .u={0x00005678,0x00001234}, .v={0x00009abc}, .cq={0x1e1dba76,0}, .cr={0x6bd0}},
 407         {.m=2, .n=2, .u={0,0}, .v={0,1}, .cq={0}, .cr={0,0}},
 408         {.m=2, .n=2, .u={0,7}, .v={0,3}, .cq={2}, .cr={0,1}},
 409         {.m=2, .n=2, .u={5,7}, .v={0,3}, .cq={2}, .cr={5,1}},
 410         {.m=2, .n=2, .u={0,6}, .v={0,2}, .cq={3}, .cr={0,0}},
 411         {.m=1, .n=1, .u={0x80000000}, .v={0x40000001}, .cq={0x00000001}, .cr={0x3fffffff}},
 412         {.m=2, .n=1, .u={0x00000000,0x80000000}, .v={0x40000001}, .cq={0xfffffff8,0x00000001}, .cr={0x00000008}},
 413         {.m=2, .n=2, .u={0x00000000,0x80000000}, .v={0x00000001,0x40000000}, .cq={0x00000001}, .cr={0xffffffff,0x3fffffff}},
 414         {.m=2, .n=2, .u={0x0000789a,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={1}, .cr={0,0}},
 415         {.m=2, .n=2, .u={0x0000789b,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={1}, .cr={1,0}},
 416         {.m=2, .n=2, .u={0x00007899,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={0}, .cr={0x00007899,0x0000bcde}},
 417         {.m=2, .n=2, .u={0x0000ffff,0x0000ffff}, .v={0x0000ffff,0x0000ffff}, .cq={1}, .cr={0,0}},
 418         {.m=2, .n=2, .u={0x0000ffff,0x0000ffff}, .v={0x00000000,0x00000001}, .cq={0x0000ffff}, .cr={0x0000ffff,0}},
 419         {.m=3, .n=2, .u={0x000089ab,0x00004567,0x00000123}, .v={0x00000000,0x00000001}, .cq={0x00004567,0x00000123}, .cr={0x000089ab,0}},
 420         {.m=3, .n=2, .u={0x00000000,0x0000fffe,0x00008000}, .v={0x0000ffff,0x00008000}, .cq={0xffffffff,0x00000000}, .cr={0x0000ffff,0x00007fff}}, // Shows that first qhat can = b + 1.
 421         {.m=3, .n=3, .u={0x00000003,0x00000000,0x80000000}, .v={0x00000001,0x00000000,0x20000000}, .cq={0x00000003}, .cr={0,0,0x20000000}}, // Adding back step req'd.
 422         {.m=3, .n=3, .u={0x00000003,0x00000000,0x00008000}, .v={0x00000001,0x00000000,0x00002000}, .cq={0x00000003}, .cr={0,0,0x00002000}}, // Adding back step req'd.
 423         {.m=4, .n=3, .u={0,0,0x00008000,0x00007fff}, .v={1,0,0x00008000}, .cq={0xfffe0000,0}, .cr={0x00020000,0xffffffff,0x00007fff}}, // Add back req'd.
 424         {.m=4, .n=3, .u={0,0x0000fffe,0,0x00008000}, .v={0x0000ffff,0,0x00008000}, .cq={0xffffffff,0}, .cr={0x0000ffff,0xffffffff,0x00007fff}}, // Shows that mult-sub quantity cannot be treated as signed.
 425         {.m=4, .n=3, .u={0,0xfffffffe,0,0x80000000}, .v={0x0000ffff,0,0x80000000}, .cq={0x00000000,1}, .cr={0x00000000,0xfffeffff,0x00000000}}, // Shows that mult-sub quantity cannot be treated as signed.
 426         {.m=4, .n=3, .u={0,0xfffffffe,0,0x80000000}, .v={0xffffffff,0,0x80000000}, .cq={0xffffffff,0}, .cr={0xffffffff,0xffffffff,0x7fffffff}}, // Shows that mult-sub quantity cannot be treated as signed.
 427         // clang-format on
 428     };
 429     const int ncases = sizeof(test) / sizeof(test[0]);
 430     unsigned q[10], r[10];
 431
 432     printf("divmnu:\n");
 433     for (int i = 0; i < ncases; i++)
 434     {
 435         int m = test[i].m;
 436         int n = test[i].n;
 437         uint32_t *u = test[i].u;
 438         uint32_t *v = test[i].v;
 439         uint32_t *cq = test[i].cq;
 440         uint32_t *cr = test[i].cr;
 441
 442         int f = divmnu(q, r, u, v, m, n);
 443         if (f && !test[i].error)
 444         {
 445             dumpit("Unexpected error return code for dividend u =", m, u);
 446             dumpit("                                 divisor  v =", n, v);
 447             errors = errors + 1;
 448         }
 449         else if (!f && test[i].error)
 450         {
 451             dumpit("Unexpected success return code for dividend u =", m, u);
 452             dumpit("                                 divisor  v =", n, v);
 453             errors = errors + 1;
 454         }
 455
 456         if (!f)
 457             check(q, r, u, v, m, n, cq, cr);
 458     }
 459
 460     printf("%d test failures out of %d cases.\n", errors, ncases);
 461     return 0;
 462 }