else
SVSHAPE0[30:31] <- 0b11 # DCT mode
SVSHAPE0[6:11] <- 0b000101 # DCT "half-swap" mode
- # set schedule up for parallel reduction
+ # set schedule up for parallel reduction or prefix-sum
if (SVrm = 0b0111) then
+ # is scan/prefix-sum
+ is_scan <- SVyd = 2
# calculate the total number of operations (brute-force)
vlen[0:6] <- [0] * 7
itercount[0:6] <- (0b00 || SVxd) + 0b0000001
- step[0:6] <- 0b0000001
- i[0:6] <- 0b0000000
- do while step <u itercount
- newstep <- step[1:6] || 0b0
- j[0:6] <- 0b0000000
- do while (j+step <u itercount)
- j <- j + newstep
- i <- i + 1
- step <- newstep
- # VL in Parallel-Reduce is the number of operations
- vlen[0:6] <- i
+ if is_scan then
+ # prefix sum algorithm with operations replaced with
+ # incrementing vlen
+ dist <- 1
+ vlen[0:6] <- 0
+ do while dist <u itercount
+ start <- dist * 2 - 1
+ step <- dist * 2
+ i <- start
+ do while i <u itercount
+ vlen[0:6] <- vlen[0:6] + 1
+ i <- i + step
+ dist <- dist * 2
+ dist <- dist / 2
+ do while dist != 0
+ i <- dist * 3 - 1
+ do while i <u itercount
+ vlen[0:6] <- vlen[0:6] + 1
+ i <- i + dist * 2
+ dist <- dist / 2
+ else
+ step <- 0b0000001
+ i <- 0b0000000
+ do while step <u itercount
+ newstep <- step[1:6] || 0b0
+ j[0:6] <- 0b0000000
+ do while (j+step <u itercount)
+ j <- j + newstep
+ i <- i + 1
+ step <- newstep
+ # VL in Parallel-Reduce is the number of operations
+ vlen[0:6] <- i
# set up template in SVSHAPE0, then copy to 1. only 2 needed
SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
# copy
SVSHAPE1[0:31] <- SVSHAPE0[0:31]
# set up submodes: parallel or prefix
- if (SVyd = 1) then
- SVSHAPE0[28:29] <- 0b00 # left operand
- SVSHAPE1[28:29] <- 0b01 # right operand
- if (SVyd = 2) then
- SVSHAPE0[28:29] <- 0b10 # left operand
- SVSHAPE1[28:29] <- 0b11 # right operand
SVSHAPE0[28:29] <- 0b00 # left operand
SVSHAPE1[28:29] <- 0b01 # right operand
+ if is_scan then
+ SVSHAPE0[28:29] <- 0b10 # left operand
+ SVSHAPE1[28:29] <- 0b11 # right operand
# set VL, MVL and Vertical-First
m[0:12] <- vlen * mscale
maxvl[0:6] <- m[6:12]
--- /dev/null
+import itertools
+import operator
+from openpower.simulator.program import Program
+from openpower.sv.trans.svp64 import SVP64Asm
+from openpower.test.state import ExpectedState
+from openpower.test.common import TestAccumulatorBase, skip_case
+from nmutil.prefix_sum import prefix_sum, prefix_sum_ops
+
+
+class ParallelPrefixSumCases(TestAccumulatorBase):  # cases running svshape mode 0x7 with remapped sv.add / sv.subf
+ def case_prefix_sum(self):  # in-place scan with addition over 8 GPRs
+ inp = 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80  # one distinct bit per element
+ expected = prefix_sum(inp, fn=operator.add, work_efficient=True)  # reference result from the nmutil model
+ assert expected == [0x1, 0x3, 0x7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF]  # sanity-check the model: running (inclusive) sums
+ gprs = [0] * 32  # initial register image, all zero
+ for i, v in enumerate(inp):
+ gprs[i + 10] = v  # inputs occupy r10..r17
+ len_inp = len(inp)
+ prog = Program(list(SVP64Asm([
+ # setup SVSHAPE[01] and VL/MAXVL for prefix-sum
+ f"svshape {len_inp}, 3, 1, 0x7, 0",  # NOTE(review): 2nd operand 3 presumably selects scan rather than reduce - confirm
+ # activate SVSHAPE0 (prefix-sum lhs) for RA
+ # activate SVSHAPE1 (prefix-sum rhs) for RT and RB
+ "svremap 0o13, 0, 1, 0, 1, 0, 0",
+ "sv.add *10, *10, *10",  # source and destination vectors all start at r10
+ ])), False)  # NOTE(review): meaning of the False argument is not visible here - confirm
+ e = ExpectedState(pc=0x10, int_regs=gprs)  # NOTE(review): 0x10 presumably is the end of the 3-instruction program - confirm
+ for i, v in enumerate(expected):
+ e.intregs[i + 10] = v  # results are expected in the same registers that held the inputs
+ self.add_case(prog, gprs, expected=e)  # register the program, initial GPRs and expected final state
+
+ def case_scan_sub(self):  # same scan, but with subtraction as the operator
+ inp = list(range(8))  # 0..7
+ expected = prefix_sum(inp, fn=operator.sub, work_efficient=True)  # reference result from the nmutil model
+ assert expected == [0, -1, -3, 0, -4, 1, -5, 0]  # pins the model output for this operator
+ expected = [i % 2 ** 64 for i in expected] # cast to u64
+ gprs = [0] * 32  # initial register image, all zero
+ for i, v in enumerate(inp):
+ gprs[i + 10] = v  # inputs occupy r10..r17
+ len_inp = len(inp)
+ prog = Program(list(SVP64Asm([
+ # setup SVSHAPE[01] and VL/MAXVL for prefix-sum
+ f"svshape {len_inp}, 3, 1, 0x7, 0",  # same svshape setup as case_prefix_sum
+ # note subf has RA/RB reversed from normal sub
+ # activate SVSHAPE0 (prefix-sum lhs) for RB (not RA)
+ # activate SVSHAPE1 (prefix-sum rhs) for RT and RA (not RB)
+ "svremap 0o13, 1, 0, 0, 1, 0, 0",  # 2nd and 3rd operands swapped relative to case_prefix_sum
+ "sv.subf *10, *10, *10",  # source and destination vectors all start at r10
+ ])), False)  # NOTE(review): meaning of the False argument is not visible here - confirm
+ e = ExpectedState(pc=0x10, int_regs=gprs)  # NOTE(review): 0x10 presumably is the end of the 3-instruction program - confirm
+ for i, v in enumerate(expected):
+ e.intregs[i + 10] = v  # results are expected in the same registers that held the inputs
+ self.add_case(prog, gprs, expected=e)  # register the program, initial GPRs and expected final state