From 33b2813ff64417b513e3ac569f42ad72574eef9f Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos.margaritis@vectorcamp.gr>
Date: Tue, 25 Jul 2023 08:54:27 +0000
Subject: [PATCH] fix docs for updated maddsubrs/maddrs/msubrs

---
 openpower/sv/twin_butterfly.mdwn | 87 +++++++++++++++++---------------
 1 file changed, 47 insertions(+), 40 deletions(-)

diff --git a/openpower/sv/twin_butterfly.mdwn b/openpower/sv/twin_butterfly.mdwn
index faee1c2e9..516f601f3 100644
--- a/openpower/sv/twin_butterfly.mdwn
+++ b/openpower/sv/twin_butterfly.mdwn
@@ -97,35 +97,30 @@ A-Form
     | PO   |  RT  |   RA   |   RB  |   SH   |   XO |Rc |
 ```
 
-* maddsubrs  RT,RA,SH,RB
+* maddsubrs  RT,RA,RB,SH
 
 Pseudo-code:
 
 ```
     n <- SH
-    sum <- (RT) + (RA)
-    diff <- (RT) - (RA)
+    sum <- (RT[0] || RT) + (RA[0] || RA)
+    diff <- (RT[0] || RT) - (RA[0] || RA)
     prod1 <- MULS(RB, sum)
     prod2 <- MULS(RB, diff)
     if n = 0 then
-        prod1_lo <- prod1[XLEN:(XLEN*2) - 1]
-        prod2_lo <- prod2[XLEN:(XLEN*2) - 1]
+        prod1_lo <- prod1[XLEN+1:(XLEN*2)]
+        prod2_lo <- prod2[XLEN+1:(XLEN*2)]
         RT <- prod1_lo
         RS <- prod2_lo
     else
-        round <- [0]*(XLEN*2)
-        round[XLEN*2 - n] <- 1
+        round <- [0]*(XLEN*2 + 1)
+        round[XLEN*2 - n + 1] <- 1
         prod1 <- prod1 + round
         prod2 <- prod2 + round
-        m <- MASK(XLEN - n - 2, XLEN - 1)
-        res1 <- prod1[XLEN - n:XLEN*2 - n - 1]
-        res2 <- prod2[XLEN - n:XLEN*2 - n - 1]
-        signbit1 <- prod1[0]
-        signbit2 <- prod2[0]
-        smask1 <- ([signbit1]*XLEN) & ¬m
-        smask2 <- ([signbit2]*XLEN) & ¬m
-        RT <- (res1 | smask1)
-        RS <- (res2 | smask2)
+        res1 <- prod1[XLEN - n + 1:XLEN*2 - n]
+        res2 <- prod2[XLEN - n + 1:XLEN*2 - n]
+        RT <- res1
+        RS <- res2
 ```
 
 Similar to `RTp`, this instruction produces an implicit result, `RS`,
@@ -139,11 +134,11 @@ Special Registers Altered:
     None
 ```
 
-# [DRAFT] Integer Butterfly Multiply Add/Sub and Accumulate FFT/DCT
+# [DRAFT] Integer Butterfly Multiply Add and Round Shift FFT/DCT
 
 A-Form
 
-* maddrs  RT,RA,SH,RB
+* maddrs  RT,RA,RB,SH
 
 Pseudo-code:
 
@@ -153,51 +148,63 @@ Pseudo-code:
     if n = 0 then
         prod_lo <- prod[XLEN:(XLEN*2) - 1]
         RT <- (RT) + prod_lo
-        RS <- (RS) - prod_lo
     else
-        res1[0:XLEN*2-1] <- (EXTSXL((RT)[0], 1) || (RT)) + prod
-        res2[0:XLEN*2-1] <- (EXTSXL((RS)[0], 1) || (RS)) - prod
+        res[0:XLEN*2-1] <- (EXTSXL((RT)[0], 1) || (RT)) + prod
         round <- [0]*XLEN*2
         round[XLEN*2 - n] <- 1
-        res1 <- res1 + round
-        res2 <- res2 + round
-        signbit1 <- res1[0]
-        signbit2 <- res2[0]
-        m <- MASK(XLEN -n - 2, XLEN - 1)
-        res1 <- res1[XLEN - n:XLEN*2 - n -1]
-        res2 <- res2[XLEN - n:XLEN*2 - n -1]
-        smask1 <- ([signbit1]*XLEN) & ¬m
-        smask2 <- ([signbit2]*XLEN) & ¬m
-        RT <- (res1 | smask1)
-        RS <- (res2 | smask2)
+        res <- res + round
+        RT <- res[XLEN - n:XLEN*2 - n -1]
+```
+
+Special Registers Altered:
+
+    None
+
+# [DRAFT] Integer Butterfly Multiply Sub and Round Shift FFT/DCT
+
+A-Form
+
+* msubrs  RT,RA,RB,SH
+
+Pseudo-code:
+
+```
+    n <- SH
+    prod <- MULS(RB, RA)
+    if n = 0 then
+        prod_lo <- prod[XLEN:(XLEN*2) - 1]
+        RT <- (RT) - prod_lo
+    else
+        res[0:XLEN*2-1] <- (EXTSXL((RT)[0], 1) || (RT)) - prod
+        round <- [0]*XLEN*2
+        round[XLEN*2 - n] <- 1
+        res <- res + round
+        RT <- res[XLEN - n:XLEN*2 - n -1]
 ```
 
 Special Registers Altered:
 
     None
 
-Similar to `RTp`, this instruction produces an implicit result, `RS`,
-which under Scalar circumstances is defined as `RT+1`.  For SVP64 if
-`RT` is a Vector, `RS` begins immediately after the Vector `RT` where
-the length of `RT` is set by `SVSTATE.MAXVL` (Max Vector Length).
 
-This instruction is supposed to be used in complement to the maddsubrs
+This pair of instructions is intended to be used together with `maddsubrs`
 to produce the double-coefficient butterfly instruction. In order for that
 to work, instead of passing c2 as coefficient, we have to pass c2-c1 instead.
 
 In essence, we are calculating the quantity `a * c1 +/- b * c1` first, with
 `maddsubrs` *without* shifting (so `SH=0`) and then we add/sub `b * (c2-c1)`
-from the previous `RT`/`RS`, and *then* do the shifting.
+from the previous `RT`, and *then* do the shifting.
 
 In the following example, assume `a` in `R1`, `b` in `R10`, `c1` in `R11` and `c2 - c1` in `R12`.
 The first instruction will put `a * c1 + b * c1` in `R1` (`RT`), `a * c1 - b * c1` in `RS`
 (here, `RS = RT +1`, so `R2`).
-Then, `maddrs` will add `b * (c2 - c1)` to `R1` (`RT`), and subtract it from `R2` (`RS`), and then
+Then, `maddrs` will add `b * (c2 - c1)` to `R1` (`RT`), and `msubrs` will subtract it from `R2` (the `RS` result of `maddsubrs`, passed here as `RT`), and then
 round shift right both quantities 14 bits:
 
 ```
-    maddsubrs 1,10,0,11
+    maddsubrs 1,10,11,0
-    maddrs 1,10,14,12
+    maddrs 1,10,12,14
+    msubrs 2,10,12,14
 ```
 
 In scalar code, that would take ~16 instructions for both operations.
-- 
2.30.2