From 5bc9709bc9496e19eb66907029abd1898ceaf177 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Sun, 24 Apr 2022 18:06:18 +0100
Subject: [PATCH] morphing divmnu64.c into big* functions then attempt
 goldschmidt

---
 openpower/sv/biginteger/divgnu64.c | 348 +++++++++++++++++++++++++++++
 1 file changed, 348 insertions(+)
 create mode 100644 openpower/sv/biginteger/divgnu64.c

diff --git a/openpower/sv/biginteger/divgnu64.c b/openpower/sv/biginteger/divgnu64.c
new file mode 100644
index 000000000..e40831fc7
--- /dev/null
+++ b/openpower/sv/biginteger/divgnu64.c
@@ -0,0 +1,348 @@
+/* original source code from Hackers-Delight
+   https://github.com/hcs0/Hackers-Delight
+*/
+/* This divides an n-word dividend by an m-word divisor, giving an
+n-m+1-word quotient and m-word remainder. The bignums are in arrays of
+words. Here a "word" is 32 bits. This routine is designed for a 64-bit
+machine which has a 64/64 division instruction. */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h> //To define "exit", req'd by XLC.
+
+#define max(x, y) ((x) > (y) ? (x) : (y))
+
+int nlz(unsigned x)
+{
+    int n;
+
+    if (x == 0)
+        return (32);
+    n = 0;
+    if (x <= 0x0000FFFF)
+    {
+        n = n + 16;
+        x = x << 16;
+    }
+    if (x <= 0x00FFFFFF)
+    {
+        n = n + 8;
+        x = x << 8;
+    }
+    if (x <= 0x0FFFFFFF)
+    {
+        n = n + 4;
+        x = x << 4;
+    }
+    if (x <= 0x3FFFFFFF)
+    {
+        n = n + 2;
+        x = x << 2;
+    }
+    if (x <= 0x7FFFFFFF)
+    {
+        n = n + 1;
+    }
+    return n;
+}
+
+void dumpit(char *msg, int n, unsigned v[])
+{
+    int i;
+    printf("%s", msg);
+    for (i = n - 1; i >= 0; i--)
+        printf(" %08x", v[i]);
+    printf("\n");
+}
+
+bool bigmul(unsigned long long qhat, unsigned vn[], unsigned un[], 
+           int j, int m, int n)
+{
+    long long t, k;
+    int s, i;
+    // Multiply and subtract.
+    uint32_t carry = 0;
+    uint32_t product[n + 1];
+    // VL = n + 1
+    // sv.madded product.v, vn.v, qhat.s, carry.s
+    for (int i = 0; i <= n; i++)
+    {
+        uint32_t vn_v = i < n ? vn[i] : 0;
+        uint64_t value = (uint64_t)vn_v * (uint64_t)qhat + carry;
+        carry = (uint32_t)(value >> 32);
+        product[i] = (uint32_t)value;
+    }
+    return carry != 0;
+}
+
+bool bigsub(unsigned long long qhat, unsigned vn[], unsigned un[], 
+           int j, int m, int n, bool ca)
+{
+    // VL = n + 1
+    // sv.subfe un_j.v, product.v, un_j.v
+    for (int i = 0; i <= n; i++)
+    {
+        uint64_t value = (uint64_t)~vn[i] + (uint64_t)un[i] + ca;
+        ca = value >> 32 != 0;
+        un[i] = value;
+    }
+    bool need_fixup = !ca;
+    return need_fixup;
+}
+
+bool bigmulsub(unsigned long long qhat, unsigned vn[], unsigned un[], 
+           int j, int m, int n)
+{
+    unsigned long long p;                 // Product of two digits.
+    long long t, k;
+    int s, i;
+    // Multiply and subtract.
+    uint32_t carry = 0;
+    uint32_t product[n + 1];
+    // VL = n + 1
+    // sv.madded product.v, vn.v, qhat.s, carry.s
+    for (int i = 0; i <= n; i++)
+    {
+        uint32_t vn_v = i < n ? vn[i] : 0;
+        uint64_t value = (uint64_t)vn_v * (uint64_t)qhat + carry;
+        carry = (uint32_t)(value >> 32);
+        product[i] = (uint32_t)value;
+    }
+    bool ca = true;
+    uint32_t *un_j = &un[j];
+    // VL = n + 1
+    // sv.subfe un_j.v, product.v, un_j.v
+    for (int i = 0; i <= n; i++)
+    {
+        uint64_t value = (uint64_t)~product[i] + (uint64_t)un_j[i] + ca;
+        ca = value >> 32 != 0;
+        un_j[i] = value;
+    }
+    bool need_fixup = !ca;
+    return need_fixup;
+}
+
+/* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
+(The sequence is in little-endian order).
+
+This is a fairly precise implementation of Knuth's Algorithm D, for a
+binary computer with base b = 2**32. The caller supplies:
+   1. Space q for the quotient, m - n + 1 words (at least one).
+   2. Space r for the remainder (optional), n words.
+   3. The dividend u, m words, m >= 1.
+   4. The divisor v, n words, n >= 2.
+The most significant digit of the divisor, v[n-1], must be nonzero.  The
+dividend u may have leading zeros; this just makes the algorithm take
+longer and makes the quotient contain more leading zeros.  A value of
+NULL may be given for the address of the remainder to signify that the
+caller does not want the remainder.
+   The program does not alter the input parameters u and v.
+   The quotient and remainder returned may have leading zeros.  The
+function itself returns a value of 0 for success and 1 for invalid
+parameters (e.g., division by 0).
+   For now, we must have m >= n.  Knuth's Algorithm D also requires
+that the dividend be at least as long as the divisor.  (In his terms,
+m >= 0 (unstated).  Therefore m+n >= n.) */
+
+int divmnu(unsigned q[], unsigned r[], const unsigned u[], const unsigned v[],
+           int m, int n)
+{
+
+    const unsigned long long b = 1LL<<32; // Number base (2**32).
+    unsigned *un, *vn;                    // Normalized form of u, v.
+    unsigned long long qhat;              // Estimated quotient digit.
+    unsigned long long rhat;              // A remainder.
+    unsigned long long p;                 // Product of two digits.
+    long long t, k;
+    int s, i, j;
+
+    if (m < n || n <= 0 || v[n - 1] == 0)
+        return 1; // Return if invalid param.
+
+    if (n == 1)
+    {          // Take care of
+        k = 0; // the case of a
+        for (j = m - 1; j >= 0; j--)
+        {                                 // single-digit
+            q[j] = (k * b + u[j]) / v[0]; // divisor here.
+            k = (k * b + u[j]) - q[j] * v[0];
+        }
+        if (r != NULL)
+            r[0] = k;
+        return 0;
+    }
+
+    /* Normalize by shifting v left just enough so that its high-order
+    bit is on, and shift u left the same amount. We may have to append a
+    high-order digit on the dividend; we do that unconditionally. */
+
+    s = nlz(v[n - 1]); // 0 <= s <= 31.
+    vn = (unsigned *)alloca(4 * n);
+    for (i = n - 1; i > 0; i--)
+        vn[i] = (v[i] << s) | ((unsigned long long)v[i - 1] >> (32 - s));
+    vn[0] = v[0] << s;
+
+    un = (unsigned *)alloca(4 * (m + 1));
+    un[m] = (unsigned long long)u[m - 1] >> (32 - s);
+    for (i = m - 1; i > 0; i--)
+        un[i] = (u[i] << s) | ((unsigned long long)u[i - 1] >> (32 - s));
+    un[0] = u[0] << s;
+
+    for (j = m - n; j >= 0; j--)
+    { // Main loop.
+        // Compute estimate qhat of q[j] from top 2 digits
+        uint64_t dig2 = ((uint64_t)un[j + n] << 32) | un[j + n - 1];
+        qhat = dig2 / vn[n - 1];
+        rhat = dig2 % vn[n - 1];
+    again:
+        // use 3rd-from-top digit to obtain better accuracy
+        if (qhat >= b || qhat * vn[n - 2] > b * rhat + un[j + n - 2])
+        {
+            qhat = qhat - 1;
+            rhat = rhat + vn[n - 1];
+            if (rhat < b)
+                goto again;
+        }
+
+        bool need_fixup = bigmulsub(qhat, vn, un, j, m, n);
+
+        q[j] = qhat; // Store quotient digit.
+        if (need_fixup)
+        {                    // If we subtracted too
+            q[j] = q[j] - 1; // much, add back.
+            k = 0;
+            for (i = 0; i < n; i++)
+            {
+                t = (unsigned long long)un[i + j] + vn[i] + k;
+                un[i + j] = t;
+                k = t >> 32;
+            }
+            un[j + n] = un[j + n] + k;
+        }
+    } // End j.
+    // If the caller wants the remainder, unnormalize
+    // it and pass it back.
+    if (r != NULL)
+    {
+        for (i = 0; i < n - 1; i++)
+            r[i] = (un[i] >> s) | ((unsigned long long)un[i + 1] << (32 - s));
+        r[n - 1] = un[n - 1] >> s;
+    }
+    return 0;
+}
+
+int errors;
+
+void check(unsigned q[], unsigned r[], unsigned u[], unsigned v[], int m,
+           int n, unsigned cq[], unsigned cr[])
+{
+    int i, szq;
+
+    szq = max(m - n + 1, 1);
+    for (i = 0; i < szq; i++)
+    {
+        if (q[i] != cq[i])
+        {
+            errors = errors + 1;
+            dumpit("Error, dividend u =", m, u);
+            dumpit("       divisor  v =", n, v);
+            dumpit("For quotient,  got:", m - n + 1, q);
+            dumpit("        Should get:", m - n + 1, cq);
+            return;
+        }
+    }
+    for (i = 0; i < n; i++)
+    {
+        if (r[i] != cr[i])
+        {
+            errors = errors + 1;
+            dumpit("Error, dividend u =", m, u);
+            dumpit("       divisor  v =", n, v);
+            dumpit("For remainder, got:", n, r);
+            dumpit("        Should get:", n, cr);
+            return;
+        }
+    }
+    return;
+}
+
+int main()
+{
+    static struct
+    {
+        int m, n;
+        uint32_t u[10], v[10], cq[10], cr[10];
+        bool error;
+    } test[] = {
+        // clang-format off
+        {.m=1, .n=1, .u={3}, .v={0}, .cq={1}, .cr={1}, .error=true}, // Error, divide by 0.
+        {.m=1, .n=2, .u={7}, .v={1,3}, .cq={0}, .cr={7,0}, .error=true}, // Error, n > m.
+        {.m=2, .n=2, .u={0,0}, .v={1,0}, .cq={0}, .cr={0,0}, .error=true}, // Error, incorrect remainder cr.
+        {.m=1, .n=1, .u={3}, .v={2}, .cq={1}, .cr={1}},
+        {.m=1, .n=1, .u={3}, .v={3}, .cq={1}, .cr={0}},
+        {.m=1, .n=1, .u={3}, .v={4}, .cq={0}, .cr={3}},
+        {.m=1, .n=1, .u={0}, .v={0xffffffff}, .cq={0}, .cr={0}},
+        {.m=1, .n=1, .u={0xffffffff}, .v={1}, .cq={0xffffffff}, .cr={0}},
+        {.m=1, .n=1, .u={0xffffffff}, .v={0xffffffff}, .cq={1}, .cr={0}},
+        {.m=1, .n=1, .u={0xffffffff}, .v={3}, .cq={0x55555555}, .cr={0}},
+        {.m=2, .n=1, .u={0xffffffff,0xffffffff}, .v={1}, .cq={0xffffffff,0xffffffff}, .cr={0}},
+        {.m=2, .n=1, .u={0xffffffff,0xffffffff}, .v={0xffffffff}, .cq={1,1}, .cr={0}},
+        {.m=2, .n=1, .u={0xffffffff,0xfffffffe}, .v={0xffffffff}, .cq={0xffffffff,0}, .cr={0xfffffffe}},
+        {.m=2, .n=1, .u={0x00005678,0x00001234}, .v={0x00009abc}, .cq={0x1e1dba76,0}, .cr={0x6bd0}},
+        {.m=2, .n=2, .u={0,0}, .v={0,1}, .cq={0}, .cr={0,0}},
+        {.m=2, .n=2, .u={0,7}, .v={0,3}, .cq={2}, .cr={0,1}},
+        {.m=2, .n=2, .u={5,7}, .v={0,3}, .cq={2}, .cr={5,1}},
+        {.m=2, .n=2, .u={0,6}, .v={0,2}, .cq={3}, .cr={0,0}},
+        {.m=1, .n=1, .u={0x80000000}, .v={0x40000001}, .cq={0x00000001}, .cr={0x3fffffff}},
+        {.m=2, .n=1, .u={0x00000000,0x80000000}, .v={0x40000001}, .cq={0xfffffff8,0x00000001}, .cr={0x00000008}},
+        {.m=2, .n=2, .u={0x00000000,0x80000000}, .v={0x00000001,0x40000000}, .cq={0x00000001}, .cr={0xffffffff,0x3fffffff}},
+        {.m=2, .n=2, .u={0x0000789a,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={1}, .cr={0,0}},
+        {.m=2, .n=2, .u={0x0000789b,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={1}, .cr={1,0}},
+        {.m=2, .n=2, .u={0x00007899,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={0}, .cr={0x00007899,0x0000bcde}},
+        {.m=2, .n=2, .u={0x0000ffff,0x0000ffff}, .v={0x0000ffff,0x0000ffff}, .cq={1}, .cr={0,0}},
+        {.m=2, .n=2, .u={0x0000ffff,0x0000ffff}, .v={0x00000000,0x00000001}, .cq={0x0000ffff}, .cr={0x0000ffff,0}},
+        {.m=3, .n=2, .u={0x000089ab,0x00004567,0x00000123}, .v={0x00000000,0x00000001}, .cq={0x00004567,0x00000123}, .cr={0x000089ab,0}},
+        {.m=3, .n=2, .u={0x00000000,0x0000fffe,0x00008000}, .v={0x0000ffff,0x00008000}, .cq={0xffffffff,0x00000000}, .cr={0x0000ffff,0x00007fff}}, // Shows that first qhat can = b + 1.
+        {.m=3, .n=3, .u={0x00000003,0x00000000,0x80000000}, .v={0x00000001,0x00000000,0x20000000}, .cq={0x00000003}, .cr={0,0,0x20000000}}, // Adding back step req'd.
+        {.m=3, .n=3, .u={0x00000003,0x00000000,0x00008000}, .v={0x00000001,0x00000000,0x00002000}, .cq={0x00000003}, .cr={0,0,0x00002000}}, // Adding back step req'd.
+        {.m=4, .n=3, .u={0,0,0x00008000,0x00007fff}, .v={1,0,0x00008000}, .cq={0xfffe0000,0}, .cr={0x00020000,0xffffffff,0x00007fff}}, // Add back req'd.
+        {.m=4, .n=3, .u={0,0x0000fffe,0,0x00008000}, .v={0x0000ffff,0,0x00008000}, .cq={0xffffffff,0}, .cr={0x0000ffff,0xffffffff,0x00007fff}}, // Shows that mult-sub quantity cannot be treated as signed.
+        {.m=4, .n=3, .u={0,0xfffffffe,0,0x80000000}, .v={0x0000ffff,0,0x80000000}, .cq={0x00000000,1}, .cr={0x00000000,0xfffeffff,0x00000000}}, // Shows that mult-sub quantity cannot be treated as signed.
+        {.m=4, .n=3, .u={0,0xfffffffe,0,0x80000000}, .v={0xffffffff,0,0x80000000}, .cq={0xffffffff,0}, .cr={0xffffffff,0xffffffff,0x7fffffff}}, // Shows that mult-sub quantity cannot be treated as signed.
+        // clang-format on
+    };
+    const int ncases = sizeof(test) / sizeof(test[0]);
+    unsigned q[10], r[10];
+
+    printf("divmnu:\n");
+    for (int i = 0; i < ncases; i++)
+    {
+        int m = test[i].m;
+        int n = test[i].n;
+        uint32_t *u = test[i].u;
+        uint32_t *v = test[i].v;
+        uint32_t *cq = test[i].cq;
+        uint32_t *cr = test[i].cr;
+
+        int f = divmnu(q, r, u, v, m, n);
+        if (f && !test[i].error)
+        {
+            dumpit("Unexpected error return code for dividend u =", m, u);
+            dumpit("                                 divisor  v =", n, v);
+            errors = errors + 1;
+        }
+        else if (!f && test[i].error)
+        {
+            dumpit("Unexpected success return code for dividend u =", m, u);
+            dumpit("                                 divisor  v =", n, v);
+            errors = errors + 1;
+        }
+
+        if (!f)
+            check(q, r, u, v, m, n, cq, cr);
+    }
+
+    printf("%d test failures out of %d cases.\n", errors, ncases);
+    return 0;
+}
-- 
2.30.2