From fa1a4f6352f673c4f2d8d1785bd7be2d1fb592bc Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 7 Jun 2018 10:54:51 +0530
Subject: [PATCH] arch-power: Fix fixed-point arithmetic multiply and divide
 instructions

This fixes the following arithmetic instructions:
  * Multiply Low Immediate (mulli)
  * Multiply Low Word (mullw[o][.])
  * Multiply High Word (mulhw[.])
  * Multiply High Word Unsigned (mulhwu[.])
  * Divide Word (divw[o][.])
  * Divide Word Unsigned (divwu[o][.])

This also fixes disassembly generation for all of the above.

Change-Id: I46fd3751b86a7436a962f8b93f26d8343f215fed
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 src/arch/power/isa/decoder.isa         |  63 ++++++---------
 src/arch/power/isa/formats/integer.isa | 105 +++++++++++++++++--------
 2 files changed, 93 insertions(+), 75 deletions(-)

diff --git a/src/arch/power/isa/decoder.isa b/src/arch/power/isa/decoder.isa
index 765408a67..456c6be01 100644
--- a/src/arch/power/isa/decoder.isa
+++ b/src/arch/power/isa/decoder.isa
@@ -216,9 +216,8 @@ decode PO default Unknown::unknown() {
         true);
 
         7: mulli({{
-            int32_t src = Ra_sw;
-            int64_t prod = src * simm;
-            Rt = (uint32_t)prod;
+            int64_t res = Ra_sd * simm;
+            Rt = res;
         }});
     }
 
@@ -564,65 +563,47 @@ decode PO default Unknown::unknown() {
             // with destination register Rt.
             format IntArithOp {
                 75: mulhw({{
-                    int64_t prod = Ra_sd * Rb_sd;
-                    Rt = prod >> 32;
+                    uint64_t res = (int64_t)Ra_sw * Rb_sw;
+                    res = res >> 32;
+                    Rt = res;
                 }});
+
                 11: mulhwu({{
-                    uint64_t prod = Ra_ud * Rb_ud;
-                    Rt = prod >> 32;
+                    uint64_t res = (uint64_t)Ra_uw * Rb_uw;
+                    res = res >> 32;
+                    Rt = res;
                 }});
-                235: mullw({{ int64_t prod = Ra_sd * Rb_sd; Rt = prod; }});
-                747: mullwo({{
-                    int64_t src1 = Ra_sd;
-                    int64_t src2 = Rb;
-                    int64_t prod = src1 * src2;
-                    Rt = prod;
+
+                235: mullw({{
+                    int64_t res = (int64_t)Ra_sw * Rb_sw;
+                    if (res != (int32_t)res) {
+                        setOV = true;
+                    }
+                    Rt = res;
                 }},
                 true);
 
                 491: divw({{
                     int32_t src1 = Ra_sw;
                     int32_t src2 = Rb_sw;
-                    if ((src1 != 0x80000000 || src2 != 0xffffffff)
-                        && src2 != 0) {
-                        Rt = src1 / src2;
-                    } else {
-                        Rt = 0;
-                    }
-                }});
-
-                1003: divwo({{
-                    int32_t src1 = Ra_sw;
-                    int32_t src2 = Rb_sw;
-                    if ((src1 != 0x80000000 || src2 != 0xffffffff)
-                        && src2 != 0) {
-                        Rt = src1 / src2;
+                    if ((src1 != INT32_MIN || src2 != -1) && src2 != 0) {
+                        Rt = (uint32_t)(src1 / src2);
                     } else {
                         Rt = 0;
-                        divSetOV = true;
+                        setOV = true;
                     }
                 }},
                 true);
 
                 459: divwu({{
-                    uint32_t src1 = Ra_sw;
-                    uint32_t src2 = Rb_sw;
+                    uint32_t src1 = Ra_uw;
+                    uint32_t src2 = Rb_uw;
                     if (src2 != 0) {
                         Rt = src1 / src2;
                     } else {
                         Rt = 0;
+                        setOV = true;
                     }
-                }});
-
-                971: divwuo({{
-                  uint32_t src1 = Ra_sw;
-                  uint32_t src2 = Rb_sw;
-                  if (src2 != 0) {
-                      Rt = src1 / src2;
-                  } else {
-                      Rt = 0;
-                      divSetOV = true;
-                  }
                 }},
                 true);
             }
diff --git a/src/arch/power/isa/formats/integer.isa b/src/arch/power/isa/formats/integer.isa
index eac6db29b..a21deab56 100644
--- a/src/arch/power/isa/formats/integer.isa
+++ b/src/arch/power/isa/formats/integer.isa
@@ -106,17 +106,24 @@ computeOVCode = '''
     }
 '''
 
-computeDivOVCode = '''
-    if (divSetOV) {
+setCACode = '''
+    if (setCA) {
+        xer.ca = 1;
+        xer.ca32 = 1;
+    } else {
+        xer.ca = 0;
+        xer.ca32 = 0;
+    }
+'''
+
+setOVCode = '''
+    if (setOV) {
         xer.ov = 1;
+        xer.ov32 = 1;
         xer.so = 1;
     } else {
-        if (findOverflow(32, %(result)s, %(inputa)s, %(inputb)s)) {
-            xer.ov = 1;
-            xer.so = 1;
-        } else {
-            xer.ov = 0;
-        }
+        xer.ov = 0;
+        xer.ov32 = 0;
     }
 '''
 
@@ -319,10 +326,14 @@ def format IntSumOp(src1, src2, ca = {{ 0 }}, computeCA = 0,
 
 // Instructions that use source registers Ra and Rb, with the result
 // placed into Rt. Basically multiply and divide instructions. The
-// carry bit is never set, but overflow can be calculated. Division
-// explicitly sets the overflow bit in certain situations and this is
-// dealt with using the 'divSetOV' boolean in decoder.isa. We generate
-// two versions of each instruction to deal with the Rc bit.
+// carry bit is never set. Overflow is detected by the instruction code
+// in decoder.isa, which sets the 'setOV' boolean whenever the overflow
+// bits have to be set.
+//
+// If computeOV is set, we generate four versions of each instruction to
+// deal with the different combinations of the OE bit and the Rc bit
+// being set or unset. Otherwise, we generate two versions of each
+// instruction to deal with the Rc bit.
 def format IntArithOp(code, computeOV = 0, inst_flags = []) {{
 
     # The result is always in Rt, but the source values vary
@@ -330,28 +341,54 @@ def format IntArithOp(code, computeOV = 0, inst_flags = []) {{
 
     # Deal with setting the overflow flag
     if computeOV:
-        code = 'bool divSetOV = false;\n' + code
-        code += computeDivOVCode % dict + setXERCode
-
-    # Setup the 2 code versions and add code to access XER if necessary
-    code_rc1 = readXERCode + code + computeCR0Code % dict
-    if computeOV:
-        code = readXERCode + code
-
-    # Generate the classes
-    (header_output, decoder_output, decode_block, exec_output) = \
-        GenAluOp(name, Name, 'IntOp', code, inst_flags,
-                 CheckRcDecode, BasicConstructor)
-
-    # Generate the second class
-    (header_output_rc1, decoder_output_rc1, _, exec_output_rc1) = \
-        GenAluOp(name, Name + 'RcSet', 'IntOp', code_rc1, inst_flags,
-                 CheckRcDecode, IntRcConstructor)
-
-    # Finally, add to the other outputs
-    header_output += header_output_rc1
-    decoder_output += decoder_output_rc1
-    exec_output += exec_output_rc1
+        # Set up the 4 code versions and add code to access XER if necessary
+        code  = 'bool setOV M5_VAR_USED = false;\n' + code
+        code_rc1 = readXERCode + code + computeCR0Code % dict
+        code_oe1 = readXERCode + code + setOVCode + setXERCode
+        code_rc1_oe1 = readXERCode + code + setOVCode + setXERCode
+        code_rc1_oe1 += computeCR0Code % dict
+
+        # Generate the classes
+        (header_output, decoder_output, decode_block, exec_output) = \
+            GenAluOp(name, Name, 'IntArithOp', code, inst_flags,
+                     CheckRcOeDecode, BasicConstructor)
+        (header_output_rc1, decoder_output_rc1, _, exec_output_rc1) = \
+            GenAluOp(name, Name + 'RcSet', 'IntArithOp', code_rc1, inst_flags,
+                     CheckRcOeDecode, IntRcConstructor)
+        (header_output_oe1, decoder_output_oe1, _, exec_output_oe1) = \
+            GenAluOp(name, Name + 'OeSet', 'IntArithOp', code_oe1, inst_flags,
+                     CheckRcOeDecode, IntOeConstructor)
+        (header_output_rc1_oe1, decoder_output_rc1_oe1, _,
+         exec_output_rc1_oe1) = \
+            GenAluOp(name, Name + 'RcSetOeSet', 'IntArithOp', code_rc1_oe1,
+                     inst_flags, CheckRcOeDecode, IntRcOeConstructor)
+
+        # Finally, add to the other outputs
+        header_output += \
+            header_output_rc1 + header_output_oe1 + header_output_rc1_oe1
+        decoder_output += \
+            decoder_output_rc1 + decoder_output_oe1 + decoder_output_rc1_oe1
+        exec_output += \
+            exec_output_rc1 + exec_output_oe1 + exec_output_rc1_oe1
+
+    else:
+        # Set up the 2 code versions and add code to access XER if necessary
+        code_rc1 = readXERCode + code + computeCR0Code % dict
+
+        # Generate the first class
+        (header_output, decoder_output, decode_block, exec_output) = \
+            GenAluOp(name, Name, 'IntArithOp', code, inst_flags,
+                     CheckRcDecode, BasicConstructor)
+
+        # Generate the second class
+        (header_output_rc1, decoder_output_rc1, _, exec_output_rc1) = \
+            GenAluOp(name, Name + 'RcSet', 'IntArithOp', code_rc1, inst_flags,
+                     CheckRcDecode, IntRcConstructor)
+
+        # Finally, add to the other outputs
+        header_output += header_output_rc1
+        decoder_output += decoder_output_rc1
+        exec_output += exec_output_rc1
 }};
 
 
-- 
2.30.2