# for convenience, VL to be calculated and stored in SVSTATE
vlen <- [0] * 7
+ mscale[0:6] <- 0b0000001 # for scaling MAXVL
itercount[0:6] <- [0] * 7
SVSTATE[0:31] <- [0] * 32
# only overwrite REMAP if "persistence" is zero
# for FRA and FRT
SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D FFT)
+ mscale <- (0b0 || SVzd) + 1
SVSHAPE0[30:31] <- 0b01 # Butterfly mode
# copy
SVSHAPE1[0:31] <- SVSHAPE0[0:31]
# set up FRB and FRS
SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
+ mscale <- (0b0 || SVzd) + 1
if (SVrm = 0b1010) | (SVrm = 0b1100) then
SVSHAPE0[30:31] <- 0b11 # iDCT mode
SVSHAPE0[18:20] <- 0b011 # iDCT Inner Butterfly sub-mode
# set up FRB and FRS
SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
+ mscale <- (0b0 || SVzd) + 1
if (SVrm = 0b1011) then
SVSHAPE0[30:31] <- 0b11 # iDCT mode
SVSHAPE0[18:20] <- 0b011 # iDCT Outer Butterfly sub-mode
# set up FRB and FRS
SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
+ mscale <- (0b0 || SVzd) + 1
SVSHAPE0[30:31] <- 0b01 # DCT/FFT mode
SVSHAPE0[6:11] <- 0b000100 # DCT Inner Butterfly COS-gen mode
if (SVrm = 0b0101) then
# set up template in SVSHAPE0
SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
+ mscale <- (0b0 || SVzd) + 1
if (SVrm = 0b1110) then
SVSHAPE0[18:20] <- 0b001 # DCT opposite half-swap
if (SVrm = 0b1111) then
# set up template in SVSHAPE0, then copy to 1. only 2 needed
SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
+ mscale <- (0b0 || SVzd) + 1
SVSHAPE0[30:31] <- 0b10 # parallel reduce submode
# copy
SVSHAPE1[0:31] <- SVSHAPE0[0:31]
# set up right operand (left operand 28:29 is zero)
SVSHAPE1[28:29] <- 0b01 # right operand
# set VL, MVL and Vertical-First
- SVSTATE[0:6] <- vlen
- SVSTATE[7:13] <- vlen
+ m[0:12] <- vlen * mscale
+ maxvl[0:6] <- m[6:12]
+ SVSTATE[0:6] <- vlen # VL
+ SVSTATE[7:13] <- maxvl # MAXVL
SVSTATE[63] <- vf
Special Registers Altered:
self.assertEqual(sim.fpr(i+0), t)
self.assertEqual(sim.fpr(i+4), u)
- def test_sv_remap_fpmadds_dct_inner_4(self):
+ def test_sv_remap_fpmadds_dct_inner_4(self, stride=1):
""">>> lst = ["svshape 4, 1, 1, 2, 0",
"svremap 27, 1, 0, 2, 0, 1, 0",
- "sv.fdmadds *0, *0, *0, *8"
+ "sv.fdmadds *0, *0, *0, *32"
]
runs a full in-place 4-long O(N log2 N) inner butterfly schedule
for DCT
is straight Vectorised (0123...) because DCT coefficients
cannot be shared between butterfly layers (due to +0.5)
"""
- lst = SVP64Asm( ["svshape 4, 1, 1, 2, 0",
+ lst = SVP64Asm( ["svshape 4, 1, %d, 2, 0" % stride,
"svremap 27, 1, 0, 2, 0, 1, 0",
- "sv.fdmadds *0, *0, *0, *8"
+ "sv.fdmadds *0, *0, *0, *32"
])
lst = list(lst)
av = [av[ri[i]] for i in range(n)]
# store in regfile
- fprs = [0] * 32
+ fprs = [0] * 64
for i, c in enumerate(coe):
- fprs[i+8] = fp64toselectable(1.0 / c) # invert
+ fprs[i*stride+32] = fp64toselectable(1.0 / c) # invert
for i, a in enumerate(av):
- fprs[i+0] = fp64toselectable(a)
+ fprs[i*stride+0] = fp64toselectable(a)
with Program(lst, bigendian=False) as program:
sim = self.run_tst_program(program, initial_fprs=fprs)
res = transform_inner_radix2_dct(avi, coe)
for i, expected in enumerate(res):
- print ("i", i, float(sim.fpr(i)), "expected", expected)
+ print ("i", i*stride, float(sim.fpr(i*stride)),
+ "expected", expected)
for i, expected in enumerate(res):
# convert to Power single
expected = fph.DOUBLE2SINGLE(fp64toselectable(expected))
expected = float(expected)
- actual = float(sim.fpr(i))
+ actual = float(sim.fpr(i*stride))
# approximate error calculation, good enough test
# reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB
# and the rounding is different