From: Gabe Black <gabeblack@google.com>
Date: Tue, 16 May 2017 02:39:51 +0000 (-0700)
Subject: x86: Fix the multiplication microops.
X-Git-Tag: v19.0.0.0~2794
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=c1ec4c4f8c22864e6e6c0d5a6d833f413d3a58d7;p=gem5.git

x86: Fix the multiplication microops.

If the operands were 64 bits wide, summing the partial products in an
intermediate calculation could overflow and lose a carry bit. This change
rearranges that intermediate calculation when the operand width is large,
and reworks the microop implementation in general in an attempt to make it
easier to understand.

Change-Id: Ib36333f3f2695a33cd9623e43682de22ebd2e7ea
Reviewed-on: https://gem5-review.googlesource.com/3381
Reviewed-by: Jason Lowe-Power <jason@lowepower.com>
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
---

diff --git a/src/arch/x86/isa/microops/regop.isa b/src/arch/x86/isa/microops/regop.isa
index ef0c4cb18..dc5f0affe 100644
--- a/src/arch/x86/isa/microops/regop.isa
+++ b/src/arch/x86/isa/microops/regop.isa
@@ -546,23 +546,42 @@ let {{
     class Mul1s(WrRegOp):
         op_class = 'IntMultOp'
 
+        # Multiply Aa and Bb, where Aa = (A << p) + a and Bb = (B << p) + b,
+        # then correct for negative operands.
+        #   Aa * Bb
+        # = ((A << p) + a) * ((B << p) + b)
+        # = ((A * B) << 2p) + ((A * b + a * B) << p) + a * b
         code = '''
             ProdLow = psrc1 * op2;
-            int halfSize = (dataSize * 8) / 2;
-            uint64_t shifter = (ULL(1) << halfSize);
-            uint64_t hiResult;
-            uint64_t psrc1_h = psrc1 / shifter;
-            uint64_t psrc1_l = psrc1 & mask(halfSize);
-            uint64_t psrc2_h = (op2 / shifter) & mask(halfSize);
-            uint64_t psrc2_l = op2 & mask(halfSize);
-            hiResult = ((psrc1_l * psrc2_h + psrc1_h * psrc2_l +
-                        ((psrc1_l * psrc2_l) / shifter)) /shifter) +
-                       psrc1_h * psrc2_h;
+
+            int p = (dataSize * 8) / 2;
+            uint64_t A = bits(psrc1, 2 * p - 1, p);
+            uint64_t a = bits(psrc1, p - 1, 0);
+            uint64_t B = bits<uint64_t>(op2, 2 * p - 1, p);
+            uint64_t b = bits<uint64_t>(op2, p - 1, 0);
+
+            uint64_t c1, c2; // Carry between place values.
+            uint64_t ab = a * b, Ab = A * b, aB = a * B, AB = A * B;
+
+            c1 = ab >> p;
+
+            // Be careful to avoid overflow if p is large.
+            if (p == 32) {
+                c2 = (c1 >> 1) + (Ab >> 1) + (aB >> 1);
+                c2 += ((c1 & 0x1) + (Ab & 0x1) + (aB & 0x1)) >> 1;
+                c2 >>= (p - 1);
+            } else {
+                c2 = (c1 + Ab + aB) >> p;
+            }
+
+            uint64_t hi = AB + c2;
+
             if (bits(psrc1, dataSize * 8 - 1))
-                hiResult -= op2;
+                hi -= op2;
             if (bits(op2, dataSize * 8 - 1))
-                hiResult -= psrc1;
-            ProdHi = hiResult;
+                hi -= psrc1;
+
+            ProdHi = hi;
             '''
         flag_code = '''
             if ((-ProdHi & mask(dataSize * 8)) !=
@@ -578,17 +597,34 @@ let {{
     class Mul1u(WrRegOp):
         op_class = 'IntMultOp'
 
+        # Multiply Aa and Bb, where Aa = (A << p) + a and Bb = (B << p) + b.
+        #   Aa * Bb
+        # = ((A << p) + a) * ((B << p) + b)
+        # = ((A * B) << 2p) + ((A * b + a * B) << p) + a * b
         code = '''
             ProdLow = psrc1 * op2;
-            int halfSize = (dataSize * 8) / 2;
-            uint64_t shifter = (ULL(1) << halfSize);
-            uint64_t psrc1_h = psrc1 / shifter;
-            uint64_t psrc1_l = psrc1 & mask(halfSize);
-            uint64_t psrc2_h = (op2 / shifter) & mask(halfSize);
-            uint64_t psrc2_l = op2 & mask(halfSize);
-            ProdHi = ((psrc1_l * psrc2_h + psrc1_h * psrc2_l +
-                      ((psrc1_l * psrc2_l) / shifter)) / shifter) +
-                     psrc1_h * psrc2_h;
+
+            int p = (dataSize * 8) / 2;
+            uint64_t A = bits(psrc1, 2 * p - 1, p);
+            uint64_t a = bits(psrc1, p - 1, 0);
+            uint64_t B = bits<uint64_t>(op2, 2 * p - 1, p);
+            uint64_t b = bits<uint64_t>(op2, p - 1, 0);
+
+            uint64_t c1, c2; // Carry between place values.
+            uint64_t ab = a * b, Ab = A * b, aB = a * B, AB = A * B;
+
+            c1 = ab >> p;
+
+            // Be careful to avoid overflow if p is large.
+            if (p == 32) {
+                c2 = (c1 >> 1) + (Ab >> 1) + (aB >> 1);
+                c2 += ((c1 & 0x1) + (Ab & 0x1) + (aB & 0x1)) >> 1;
+                c2 >>= (p - 1);
+            } else {
+                c2 = (c1 + Ab + aB) >> p;
+            }
+
+            ProdHi = AB + c2;
             '''
         flag_code = '''
             if (ProdHi) {