prefix-sum remap works!
author: Jacob Lifshay <programmerjake@gmail.com>
Fri, 28 Apr 2023 08:49:30 +0000 (01:49 -0700)
committer: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Fri, 2 Jun 2023 18:51:17 +0000 (19:51 +0100)
openpower/isa/simplev.mdwn
src/openpower/decoder/isa/test_caller_svp64_parallel_prefix_sum.py [new file with mode: 0644]
src/openpower/test/svp64/__init__.py [new file with mode: 0644]
src/openpower/test/svp64/parallel_prefix_sum.py [new file with mode: 0644]

index fae79b4c0548ac47ba699ff739307f9e3ac61c8d..33a02e6612065f290d840e15a596dfc2177de5e5 100644 (file)
@@ -278,22 +278,45 @@ Pseudo-code:
         else
             SVSHAPE0[30:31] <- 0b11          # DCT mode
         SVSHAPE0[6:11] <- 0b000101       # DCT "half-swap" mode
-    # set schedule up for parallel reduction
+    # set schedule up for parallel reduction or prefix-sum
     if (SVrm = 0b0111) then
+        # is scan/prefix-sum
+        is_scan <- SVyd = 2
         # calculate the total number of operations (brute-force)
         vlen[0:6] <- [0] * 7
         itercount[0:6] <- (0b00 || SVxd) + 0b0000001
-        step[0:6] <- 0b0000001
-        i[0:6] <- 0b0000000
-        do while step <u itercount
-            newstep <- step[1:6] || 0b0
-            j[0:6] <- 0b0000000
-            do while (j+step <u itercount)
-                j <- j + newstep
-                i <- i + 1
-            step <- newstep
-        # VL in Parallel-Reduce is the number of operations
-        vlen[0:6] <- i
+        if is_scan then
+            # prefix sum algorithm with operations replaced with
+            # incrementing vlen
+            dist <- 1
+            vlen[0:6] <- 0
+            do while dist <u itercount
+                start <- dist * 2 - 1
+                step <- dist * 2
+                i <- start
+                do while i <u itercount
+                    vlen[0:6] <- vlen[0:6] + 1
+                    i <- i + step
+                dist <- dist * 2
+            dist <- dist / 2
+            do while dist != 0
+                i <- dist * 3 - 1
+                do while i <u itercount
+                    vlen[0:6] <- vlen[0:6] + 1
+                    i <- i + dist * 2
+                dist <- dist / 2
+        else
+            step <- 0b0000001
+            i <- 0b0000000
+            do while step <u itercount
+                newstep <- step[1:6] || 0b0
+                j[0:6] <- 0b0000000
+                do while (j+step <u itercount)
+                    j <- j + newstep
+                    i <- i + 1
+                step <- newstep
+            # VL in Parallel-Reduce is the number of operations
+            vlen[0:6] <- i
         # set up template in SVSHAPE0, then copy to 1. only 2 needed
         SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
         SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
@@ -302,14 +325,11 @@ Pseudo-code:
         # copy
         SVSHAPE1[0:31] <- SVSHAPE0[0:31]
         # set up submodes: parallel or prefix
-        if (SVyd = 1) then
-            SVSHAPE0[28:29] <- 0b00   # left operand
-            SVSHAPE1[28:29] <- 0b01   # right operand
-        if (SVyd = 2) then
-            SVSHAPE0[28:29] <- 0b10   # left operand
-            SVSHAPE1[28:29] <- 0b11   # right operand
         SVSHAPE0[28:29] <- 0b00   # left operand
         SVSHAPE1[28:29] <- 0b01   # right operand
+        if is_scan then
+            SVSHAPE0[28:29] <- 0b10   # left operand
+            SVSHAPE1[28:29] <- 0b11   # right operand
     # set VL, MVL and Vertical-First
     m[0:12] <- vlen * mscale
     maxvl[0:6] <- m[6:12]
diff --git a/src/openpower/decoder/isa/test_caller_svp64_parallel_prefix_sum.py b/src/openpower/decoder/isa/test_caller_svp64_parallel_prefix_sum.py
new file mode 100644 (file)
index 0000000..951c476
--- /dev/null
@@ -0,0 +1,23 @@
+""" svp64 parallel prefix-sum tests
+"""
+
+import unittest
+
+from openpower.test.svp64.parallel_prefix_sum import ParallelPrefixSumCases
+from openpower.test.runner import TestRunnerBase
+
+# writing the test_caller invocation this way makes it work with pytest
+
+
+class TestSVP64ParallelPrefixSum(TestRunnerBase):
+    def __init__(self, test):
+        assert test == 'test'  # only the 'test' method below exists
+        super().__init__(ParallelPrefixSumCases().test_data)
+
+    def test(self):
+        # no-op placeholder: gives unittest a 'test' method to pick up
+        pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/openpower/test/svp64/__init__.py b/src/openpower/test/svp64/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/openpower/test/svp64/parallel_prefix_sum.py b/src/openpower/test/svp64/parallel_prefix_sum.py
new file mode 100644 (file)
index 0000000..983e196
--- /dev/null
@@ -0,0 +1,53 @@
+import itertools
+import operator
+from openpower.simulator.program import Program
+from openpower.sv.trans.svp64 import SVP64Asm
+from openpower.test.state import ExpectedState
+from openpower.test.common import TestAccumulatorBase, skip_case
+from nmutil.prefix_sum import prefix_sum, prefix_sum_ops
+
+
+class ParallelPrefixSumCases(TestAccumulatorBase):
+    def case_prefix_sum(self):
+        inp = 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+        expected = prefix_sum(inp, fn=operator.add, work_efficient=True)
+        assert expected == [0x1, 0x3, 0x7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF]
+        gprs = [0] * 32  # registers not holding inputs stay zero
+        for i, v in enumerate(inp):
+            gprs[i + 10] = v  # inputs start at r10
+        len_inp = len(inp)
+        prog = Program(list(SVP64Asm([
+            # set up SVSHAPE0/SVSHAPE1 and VL/MAXVL for prefix-sum
+            f"svshape {len_inp}, 3, 1, 0x7, 0",
+            # activate SVSHAPE0 (prefix-sum lhs) for RA
+            # activate SVSHAPE1 (prefix-sum rhs) for RT and RB
+            "svremap 0o13, 0, 1, 0, 1, 0, 0",
+            "sv.add *10, *10, *10",
+        ])), False)
+        e = ExpectedState(pc=0x10, int_regs=gprs)
+        for i, v in enumerate(expected):
+            e.intregs[i + 10] = v  # results expected in place, from r10
+        self.add_case(prog, gprs, expected=e)
+
+    def case_scan_sub(self):
+        inp = list(range(8))
+        expected = prefix_sum(inp, fn=operator.sub, work_efficient=True)
+        assert expected == [0, -1, -3, 0, -4, 1, -5, 0]
+        expected = [i % 2 ** 64 for i in expected]  # wrap negatives to u64
+        gprs = [0] * 32  # registers not holding inputs stay zero
+        for i, v in enumerate(inp):
+            gprs[i + 10] = v  # inputs start at r10
+        len_inp = len(inp)
+        prog = Program(list(SVP64Asm([
+            # set up SVSHAPE0/SVSHAPE1 and VL/MAXVL for prefix-sum
+            f"svshape {len_inp}, 3, 1, 0x7, 0",
+            # note subf has RA/RB reversed from normal sub
+            # activate SVSHAPE0 (prefix-sum lhs) for RB (not RA)
+            # activate SVSHAPE1 (prefix-sum rhs) for RT and RA (not RB)
+            "svremap 0o13, 1, 0, 0, 1, 0, 0",
+            "sv.subf *10, *10, *10",
+        ])), False)
+        e = ExpectedState(pc=0x10, int_regs=gprs)
+        for i, v in enumerate(expected):
+            e.intregs[i + 10] = v  # results expected in place, from r10
+        self.add_case(prog, gprs, expected=e)