From 31dcf687d8c99c06f6015ffdb6e69d6ac804975d Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos.margaritis@vectorcamp.gr>
Date: Thu, 4 May 2023 14:11:31 +0000
Subject: [PATCH] Add 2 more instructions to help with 2-coeff butterfly
 fdct_round_shift(a*c1 +/- b*c2). They are meant to complement
 maddsubrs, so one can now do this calculation in 3 instructions. Added some
 unit tests to demonstrate the operation.

---
 openpower/isa/butterfly.mdwn              | 56 +++++++++++++++
 openpower/isatables/RM-1P-2S1D.csv        |  1 -
 openpower/isatables/minor_22.csv          |  4 +-
 src/openpower/decoder/isa/caller.py       |  2 +-
 src/openpower/decoder/power_decoder2.py   |  1 +
 src/openpower/decoder/power_enums.py      |  6 +-
 src/openpower/test/alu/maddsubrs_cases.py | 87 +++++++++++++++++------
 7 files changed, 132 insertions(+), 25 deletions(-)

diff --git a/openpower/isa/butterfly.mdwn b/openpower/isa/butterfly.mdwn
index 43c5c48f..ec106793 100644
--- a/openpower/isa/butterfly.mdwn
+++ b/openpower/isa/butterfly.mdwn
@@ -38,3 +38,59 @@ Pseudo-code:
 Special Registers Altered:
 
     None
+
+# [DRAFT] Integer Butterfly Multiply Add and Accumulate FFT/DCT
+
+A-Form
+
+* maddrs  RT,RA,SH,RB
+
+Pseudo-code:
+
+    n <- SH
+    prod <- MULS(RB, RA)
+    prod_lo <- prod[XLEN:(XLEN*2)-1]
+    if n = 0 then
+        RT <- (RT) + prod_lo
+    else
+        res <- (RT) + prod_lo
+        round <- [0]*XLEN
+        round[XLEN -n] <- 1
+        res <- res + round
+        signbit <- res[0]
+        m <- MASK(n, (XLEN-1))
+        res <- ROTL64(res, XLEN-n) & m
+        smask <- ([signbit]*XLEN) & ¬m
+        RT <- (res | smask)
+
+Special Registers Altered:
+
+    None
+
+# [DRAFT] Integer Butterfly Multiply Subtract From FFT/DCT
+
+A-Form
+
+* msubrs  RT,RA,SH,RB
+
+Pseudo-code:
+
+    n <- SH
+    prod <- MULS(RB, RA)
+    prod_lo <- prod[XLEN:(XLEN*2)-1]
+    if n = 0 then
+        RT <- (RT) - prod_lo
+    else
+        res <- (RT) - prod_lo
+        round <- [0]*XLEN
+        round[XLEN -n] <- 1
+        res <- res + round
+        signbit <- res[0]
+        m <- MASK(n, (XLEN-1))
+        res <- ROTL64(res, XLEN-n) & m
+        smask <- ([signbit]*XLEN) & ¬m
+        RT <- (res | smask)
+
+Special Registers Altered:
+
+    None
diff --git a/openpower/isatables/RM-1P-2S1D.csv b/openpower/isatables/RM-1P-2S1D.csv
index 05b756f5..4b91a4d2 100644
--- a/openpower/isatables/RM-1P-2S1D.csv
+++ b/openpower/isatables/RM-1P-2S1D.csv
@@ -121,5 +121,4 @@ fremainder,NORMAL,,1P,EXTRA3,NO,d:FRT;d:CR1,s:FRA,s:FRB,0,FRA,FRB,0,FRT,0,CR1,0
 fpowr,NORMAL,,1P,EXTRA3,NO,d:FRT;d:CR1,s:FRA,s:FRB,0,FRA,FRB,0,FRT,0,CR1,0
 fpow,NORMAL,,1P,EXTRA3,NO,d:FRT;d:CR1,s:FRA,s:FRB,0,FRA,FRB,0,FRT,0,CR1,0
 rlwimi,NORMAL,,1P,EXTRA3,NO,d:RA;d:CR0,s:RA,s:RS,0,RA,0,RS,RA,0,CR0,0
-maddsubrs,NORMAL,,1P,EXTRA3,NO,TODO,0,0,0,RA,0,RB,RT,0,CR0,0
 rldimi,NORMAL,,1P,EXTRA3,NO,d:RA;d:CR0,s:RA,s:RS,0,RA,0,RS,RA,0,CR0,0
diff --git a/openpower/isatables/minor_22.csv b/openpower/isatables/minor_22.csv
index a918a16a..1ba448b7 100644
--- a/openpower/isatables/minor_22.csv
+++ b/openpower/isatables/minor_22.csv
@@ -40,4 +40,6 @@ opcode,unit,internal op,in1,in2,in3,out,CR in,CR out,inv A,inv out,cry in,cry ou
 ------10001,ALU,OP_BMASK,RA,RB,NONE,RT,NONE,NONE,0,0,ZERO,0,NONE,0,0,0,0,0,0,NONE,0,0,bmask,BM2,,1,unofficial until submitted and approved/renumbered by the opf isa wg
 -----00011-,ALU,OP_FMVIS,NONE,CONST_UI,NONE,FRS,NONE,NONE,0,0,ZERO,0,NONE,0,0,0,0,0,0,NONE,0,0,fmvis,DX,,1,unofficial until submitted and approved/renumbered by the opf isa wg
 -----01011-,ALU,OP_FISHMV,FRS,CONST_UI,NONE,FRS,NONE,NONE,0,0,ZERO,0,NONE,0,0,0,0,0,0,NONE,0,0,fishmv,DX,,1,unofficial until submitted and approved/renumbered by the opf isa wg
-------01000,ALU,OP_MADDSUBRS,RA,CONST_SH,RB,RT,NONE,CR0,0,0,ZERO,0,NONE,0,0,0,0,1,0,RC_ONLY,0,0,maddsubrs,A,,1,unofficial until submitted and approved/renumbered by the opf isa wg
+------01000,ALU,OP_MADDSUBRS,RA,CONST_SH,RB,RT,NONE,NONE,0,0,ZERO,0,NONE,0,0,0,0,1,0,NONE,0,0,maddsubrs,A,,1,unofficial until submitted and approved/renumbered by the opf isa wg
+------01001,ALU,OP_MADDRS,RA,CONST_SH,RB,RT,NONE,NONE,0,0,ZERO,0,NONE,0,0,0,0,1,0,NONE,0,0,maddrs,A,,1,unofficial until submitted and approved/renumbered by the opf isa wg
+------01010,ALU,OP_MSUBRS,RA,CONST_SH,RB,RT,NONE,NONE,0,0,ZERO,0,NONE,0,0,0,0,1,0,NONE,0,0,msubrs,A,,1,unofficial until submitted and approved/renumbered by the opf isa wg
diff --git a/src/openpower/decoder/isa/caller.py b/src/openpower/decoder/isa/caller.py
index f5657518..101a2ad0 100644
--- a/src/openpower/decoder/isa/caller.py
+++ b/src/openpower/decoder/isa/caller.py
@@ -1920,7 +1920,7 @@ class ISACaller(ISACallerHelper, ISAFPHelpers, StepLoop):
                        "fmvtg", "fmvtgs",
                        "fcvtfg", "fcvtfgs",
                        "fmvfg", "fmvfgs",
-                       "maddsubrs"
+                       "maddsubrs", "maddrs", "msubrs"
                        ]:
             illegal = False
             ins_name = dotstrp
diff --git a/src/openpower/decoder/power_decoder2.py b/src/openpower/decoder/power_decoder2.py
index a243d825..2e06b5d7 100644
--- a/src/openpower/decoder/power_decoder2.py
+++ b/src/openpower/decoder/power_decoder2.py
@@ -1082,6 +1082,7 @@ class PowerDecodeSubset(Elaboratable):
             # implicit RS for major 22, integer maddsubrs
             with m.If((major == 22) & xo6.matches(
                     '-01000',  # maddsubrs
+                    '-01001',  # maddrs
                 )):
                 comb += self.implicit_rs.eq(1)
                 comb += self.extend_rb_maxvl.eq(1) # extend RB
diff --git a/src/openpower/decoder/power_enums.py b/src/openpower/decoder/power_enums.py
index a7b1f5b4..ebd23321 100644
--- a/src/openpower/decoder/power_enums.py
+++ b/src/openpower/decoder/power_enums.py
@@ -756,7 +756,9 @@ _insns = [
     # "lwzbr", "lwzubr", # more load word SVP64 bit-reversed
     "maddedu", "maddedus",
     "maddhd", "maddhdu", "maddld",                      # INT multiply-and-add
-    "maddsubrs",         # Integer DCT Butterfly
+    "maddsubrs",         # Integer DCT Butterfly Add Sub and Round Shift
+    "maddrs",            # Integer DCT Butterfly Add and Accumulate and Round Shift
+    "msubrs",            # Integer DCT Butterfly Subtract From and Round Shift
     "mcrf", "mcrxr", "mcrxrx", "mfcr/mfocrf",           # CR mvs
     "mfmsr", "mfspr",
     "minmax",                     # AV bitmanip
@@ -919,6 +921,8 @@ class MicrOp(Enum):
     OP_DSHR = 102
     OP_SHADD = 103
     OP_MADDSUBRS = 104
+    OP_MADDRS = 105
+    OP_MSUBRS = 106
 
 
 class In1Sel(Enum):
diff --git a/src/openpower/test/alu/maddsubrs_cases.py b/src/openpower/test/alu/maddsubrs_cases.py
index 2a495f60..f4433fcb 100644
--- a/src/openpower/test/alu/maddsubrs_cases.py
+++ b/src/openpower/test/alu/maddsubrs_cases.py
@@ -13,76 +13,121 @@ import unittest
 class MADDSUBRSTestCase(TestAccumulatorBase):
 
     def case_0_maddsubrs(self):
-        isa = SVP64Asm(["maddsubrs 1,2,14,3"])
+        isa = SVP64Asm(["maddsubrs 1,10,14,11"])
         lst = list(isa)
 
         initial_regs = [0] * 32
         initial_regs[1] = 0x00000a71
-        initial_regs[2] = 0x0000e6b8
-        initial_regs[3] = 0x00002d41
+        initial_regs[10] = 0x0000e6b8
+        initial_regs[11] = 0x00002d41
 
         e = ExpectedState(pc=4)
         e.intregs[1] = 0x0000aa86
         e.intregs[2] = 0xffffffffffff643e
-        e.intregs[3] = 0x00002d41
+        e.intregs[10] = 0x0000e6b8
+        e.intregs[11] = 0x00002d41
         self.add_case(Program(lst, bigendian), initial_regs, expected=e)
 
     def case_1_maddsubrs(self):
-        isa = SVP64Asm(["maddsubrs 1,2,0,3"])
+        isa = SVP64Asm(["maddsubrs 1,10,0,11"])
         lst = list(isa)
 
         initial_regs = [0] * 32
         initial_regs[1] = 0x00000a71
-        initial_regs[2] = 0x0000e6b8
-        initial_regs[3] = 0x00002d41
+        initial_regs[10] = 0x0000e6b8
+        initial_regs[11] = 0x00002d41
 
         e = ExpectedState(pc=4)
         e.intregs[1] = 0x2aa17069
         e.intregs[2] = 0xffffffffd90f96f9
-        e.intregs[3] = 0x00002d41
+        e.intregs[10] = 0x0000e6b8
+        e.intregs[11] = 0x00002d41
         self.add_case(Program(lst, bigendian), initial_regs, expected=e)
 
     def case_2_maddsubrs(self):
-        isa = SVP64Asm(["maddsubrs 1,2,2,3"])
+        isa = SVP64Asm(["maddsubrs 1,10,2,11"])
         lst = list(isa)
 
         initial_regs = [0] * 32
         initial_regs[1] = 0x100000000
-        initial_regs[2] = 0x000000003
-        initial_regs[3] = 0x10000000
+        initial_regs[10] = 0x000000003
+        initial_regs[11] = 0x10000000
 
         e = ExpectedState(pc=4)
         e.intregs[1] = 0x40000000c000000;
         e.intregs[2] = 0x3fffffff4000000;
-        e.intregs[3] = 0x10000000;
+        e.intregs[10] = 0x00000003
+        e.intregs[11] = 0x10000000;
         self.add_case(Program(lst, bigendian), initial_regs, expected=e)
 
     def case_3_maddsubrs(self):
-        isa = SVP64Asm(["maddsubrs 1,2,16,3"])
+        isa = SVP64Asm(["maddsubrs 1,10,16,11"])
         lst = list(isa)
 
         initial_regs = [0] * 32
         initial_regs[1] = 0x100000000
-        initial_regs[2] = 0x000000003
-        initial_regs[3] = 0x10000000
+        initial_regs[10] = 0x000000003
+        initial_regs[11] = 0x10000000
 
         e = ExpectedState(pc=4)
         e.intregs[1] = 0x100000003000;
         e.intregs[2] = 0x0fffffffd000;
-        e.intregs[3] = 0x10000000;
+        e.intregs[10] = 0x00000003
+        e.intregs[11] = 0x10000000;
         self.add_case(Program(lst, bigendian), initial_regs, expected=e)
 
-    def case_3_maddsubrs(self):
-        isa = SVP64Asm(["maddsubrs 1,2,1,3"])
+    def case_4_maddsubrs(self):
+        isa = SVP64Asm(["maddsubrs 1,10,1,11"])
         lst = list(isa)
 
         initial_regs = [0] * 32
         initial_regs[1] = 0x100000000
-        initial_regs[2] = 0x000000003
-        initial_regs[3] = 0xff0000000
+        initial_regs[10] = 0x000000003
+        initial_regs[11] = 0xff0000000
 
         e = ExpectedState(pc=4)
         e.intregs[1] = 0xf8000017e8000000;
         e.intregs[2] = 0xf7ffffe818000000;
-        e.intregs[3] = 0xff0000000;
+        e.intregs[10] = 0x000000003
+        e.intregs[11] = 0xff0000000;
+        self.add_case(Program(lst, bigendian), initial_regs, expected=e)
+
+    def case_0_maddrs(self):
+        isa = SVP64Asm(["maddsubrs 1,10,0,11",
+                        "maddrs 1,10,0,12",
+                        "msubrs 2,10,0,12"])
+        lst = list(isa)
+
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x00000a71
+        initial_regs[10] = 0x0000e6b8
+        initial_regs[11] = 0x00002d41
+        initial_regs[12] = 0x00000d00
+
+        e = ExpectedState(pc=12)
+        e.intregs[1] = 0x3658c869
+        e.intregs[2] = 0xffffffffcd583ef9
+        e.intregs[10] = 0x0000e6b8
+        e.intregs[11] = 0x00002d41
+        e.intregs[12] = 0x00000d00
+        self.add_case(Program(lst, bigendian), initial_regs, expected=e)
+
+    def case_1_maddrs(self):
+        isa = SVP64Asm(["maddsubrs 1,10,0,11",
+                        "maddrs 1,10,14,12",
+                        "msubrs 2,10,14,12"])
+        lst = list(isa)
+
+        initial_regs = [0] * 32
+        initial_regs[1] = 0x00000a71
+        initial_regs[10] = 0x0000e6b8
+        initial_regs[11] = 0x00002d41
+        initial_regs[12] = 0x00000d00
+
+        e = ExpectedState(pc=12)
+        e.intregs[1] = 0x0000d963
+        e.intregs[2] = 0xffffffffffff3561
+        e.intregs[10] = 0x0000e6b8
+        e.intregs[11] = 0x00002d41
+        e.intregs[12] = 0x00000d00
         self.add_case(Program(lst, bigendian), initial_regs, expected=e)
-- 
2.30.2