From: Jacob Lifshay <programmerjake@gmail.com>
Date: Tue, 26 Apr 2022 06:18:29 +0000 (-0700)
Subject: switch to using divrem_64_by_32 which follows semantics of proposed 128x64->64 div op
X-Git-Tag: opf_rfc_ls005_v1~2586
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b6044fdb84bea61887999ffa5e17543fa91b8f89;p=libreriscv.git

switch to using divrem_64_by_32 which follows semantics of proposed 128x64->64 div op
---

diff --git a/openpower/sv/biginteger/divmnu64.c b/openpower/sv/biginteger/divmnu64.c
index 0d3f3973a..e193a59ed 100644
--- a/openpower/sv/biginteger/divmnu64.c
+++ b/openpower/sv/biginteger/divmnu64.c
@@ -56,6 +56,24 @@ void dumpit(char *msg, int n, unsigned v[])
     printf("\n");
 }
 
+typedef struct // result of divrem_64_by_32
+{
+    uint32_t q, r; // quotient and remainder; q = UINT32_MAX, r = 0 on overflow
+    bool overflow; // true when the high 32 bits of n are >= d
+} divrem_t;
+// 64-bit by 32-bit divide with a 32-bit quotient; d == 0 is reported as overflow
+divrem_t divrem_64_by_32(uint64_t n, uint32_t d)
+{
+    if ((n >> 32) >= d) // quotient would not fit in 32 bits (always true if d == 0)
+    {
+        return (divrem_t){.q = UINT32_MAX, .r = 0, .overflow = true};
+    }
+    else
+    {
+        return (divrem_t){.q = n / d, .r = n % d, .overflow = false}; // n / d < 2^32 here
+    }
+}
+
 /* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
 (The sequence is in little-endian order).
 
@@ -126,17 +144,19 @@ int divmnu(unsigned q[], unsigned r[], const unsigned u[], const unsigned v[],
     for (j = m - n; j >= 0; j--)
     { // Main loop.
         // Compute estimate qhat of q[j] from top 2 digits.
-        // do as 2 separate divs to demo that 64/32 div/rem would
-        // be perfectly fine.
-        uint64_t dig1 = (uint64_t)un[j + n];
-        qhat = dig1 / vn[n - 1];
-        rhat = dig1 % vn[n - 1];
-        uint64_t dig2 = ((uint64_t)rhat << 32) | un[j + n - 1];
-        qhat = dig2 / vn[n - 1] | (qhat << 32);
-        rhat = dig2 % vn[n - 1] | (rhat << 32);
+        uint64_t dig2 = ((uint64_t)un[j + n] << 32) | un[j + n - 1];
+        divrem_t qr = divrem_64_by_32(dig2, vn[n - 1]);
+        qhat = qr.q;
+        rhat = qr.r;
+        if (qr.overflow)
+        {
+            // on overflow qhat is saturated and rhat can exceed 32 bits, so
+            // recompute it here; widen first so the product is not truncated
+            rhat = dig2 - (uint64_t)qr.q * vn[n - 1];
+        }
     again:
         // use 3rd-from-top digit to obtain better accuracy
-        if (qhat >= b || qhat * vn[n - 2] > b * rhat + un[j + n - 2])
+        if (rhat < b && qhat * vn[n - 2] > b * rhat + un[j + n - 2])
         {
             qhat = qhat - 1;
             rhat = rhat + vn[n - 1];