printf("\n");
}
+typedef struct
+{
+ uint32_t q, r;
+ bool overflow;
+} divrem_t;
+
+divrem_t divrem_64_by_32(uint64_t n, uint32_t d)
+{
+ if ((n >> 32) >= d) // overflow
+ {
+ return (divrem_t){.q = UINT32_MAX, .r = 0, .overflow = true};
+ }
+ else
+ {
+ return (divrem_t){.q = n / d, .r = n % d, .overflow = false};
+ }
+}
+
/* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
(The sequence is in little-endian order).
for (j = m - n; j >= 0; j--)
{ // Main loop.
// Compute estimate qhat of q[j] from top 2 digits.
- // do as 2 separate divs to demo that 64/32 div/rem would
- // be perfectly fine.
- uint64_t dig1 = (uint64_t)un[j + n];
- qhat = dig1 / vn[n - 1];
- rhat = dig1 % vn[n - 1];
- uint64_t dig2 = ((uint64_t)rhat << 32) | un[j + n - 1];
- qhat = dig2 / vn[n - 1] | (qhat << 32);
- rhat = dig2 % vn[n - 1] | (rhat << 32);
+ uint64_t dig2 = ((uint64_t)un[j + n] << 32) | un[j + n - 1];
+ divrem_t qr = divrem_64_by_32(dig2, vn[n - 1]);
+ qhat = qr.q;
+ rhat = qr.r;
+ if (qr.overflow)
+ {
+ // rhat can be bigger than 32-bit when the division overflows,
+ // so rhat's computation can't be folded into divrem_64_by_32
+ rhat = dig2 - qr.q * vn[n - 1];
+ }
again:
// use 3rd-from-top digit to obtain better accuracy
- if (qhat >= b || qhat * vn[n - 2] > b * rhat + un[j + n - 2])
+ if (rhat < b && qhat * vn[n - 2] > b * rhat + un[j + n - 2])
{
qhat = qhat - 1;
rhat = rhat + vn[n - 1];