<!-- This defines Draft SVP64 instructions to augment PowerISA Version 3.0 -->
<!-- These are not described in book 1 -->
-# svstep
+[[!inline pagenames="openpower/isa/simplev/svstep" raw="yes"]]
-SVL-Form
+[[!inline pagenames="openpower/isa/simplev/setvl" raw="yes"]]
-* svstep RT,SVi,vf (Rc=0)
-* svstep. RT,SVi,vf (Rc=1)
+[[!inline pagenames="openpower/isa/simplev/svremap" raw="yes"]]
-Pseudo-code:
+[[!inline pagenames="openpower/isa/simplev/svshape" raw="yes"]]
- if SVi[3:4] = 0b11 then
- # store pack and unpack in SVSTATE
- SVSTATE[53] <- SVi[5]
- SVSTATE[54] <- SVi[6]
- RT <- [0]*62 || SVSTATE[53:54]
- else
- step <- SVSTATE_NEXT(SVi, vf)
- RT <- [0]*57 || step
-
-Special Registers Altered:
-
- CR0 (if Rc=1)
-
-# setvl
-
-SVL-Form
-
-* setvl RT,RA,SVi,vf,vs,ms (Rc=0)
-* setvl. RT,RA,SVi,vf,vs,ms (Rc=1)
-
-Pseudo-code:
-
- overflow <- 0b0
- VLimm <- SVi + 1
- # set or get MVL
- if ms = 1 then MVL <- VLimm[0:6]
- else MVL <- SVSTATE[0:6]
- # set or get VL
- if vs = 0 then VL <- SVSTATE[7:13]
- else if _RA != 0 then
- if (RA) >u 0b1111111 then
- VL <- 0b1111111
- overflow <- 0b1
- else VL <- (RA)[57:63]
- else if _RT = 0 then VL <- VLimm[0:6]
- else if CTR >u 0b1111111 then
- VL <- 0b1111111
- overflow <- 0b1
- else VL <- CTR[57:63]
- # limit VL to within MVL
- if VL >u MVL then
- overflow <- 0b1
- VL <- MVL
- SVSTATE[0:6] <- MVL
- SVSTATE[7:13] <- VL
- if _RT != 0 then
- GPR(_RT) <- [0]*57 || VL
- # MAXVL is a static "state-reset".
- if ms = 1 then
- SVSTATE[63] <- vf # set Vertical-First mode
- SVSTATE[62] <- 0b0 # clear persist bit
-
-Special Registers Altered:
-
- CR0 (if Rc=1)
-
-# svremap
-
-SVRM-Form
-
-* svremap SVme,mi0,mi1,mi2,mo0,mo1,pst
-
-Pseudo-code:
-
- # registers RA RB RC RT EA/FRS SVSHAPE0-3 indices
- SVSTATE[32:33] <- mi0
- SVSTATE[34:35] <- mi1
- SVSTATE[36:37] <- mi2
- SVSTATE[38:39] <- mo0
- SVSTATE[40:41] <- mo1
- # enable bit for RA RB RC RT EA/FRS
- SVSTATE[42:46] <- SVme
- # persistence bit (applies to more than one instruction)
- SVSTATE[62] <- pst
-
-Special Registers Altered:
-
- None
-
-# svshape
-
-SVM-Form
-
-* svshape SVxd,SVyd,SVzd,SVrm,vf
-
-Pseudo-code:
-
- # for convenience, VL to be calculated and stored in SVSTATE
- vlen <- [0] * 7
- mscale[0:5] <- 0b000001 # for scaling MAXVL
- itercount[0:6] <- [0] * 7
- SVSTATE[0:31] <- [0] * 32
- # only overwrite REMAP if "persistence" is zero
- if (SVSTATE[62] = 0b0) then
- SVSTATE[32:33] <- 0b00
- SVSTATE[34:35] <- 0b00
- SVSTATE[36:37] <- 0b00
- SVSTATE[38:39] <- 0b00
- SVSTATE[40:41] <- 0b00
- SVSTATE[42:46] <- 0b00000
- SVSTATE[62] <- 0b0
- SVSTATE[63] <- 0b0
- # clear out all SVSHAPEs
- SVSHAPE0[0:31] <- [0] * 32
- SVSHAPE1[0:31] <- [0] * 32
- SVSHAPE2[0:31] <- [0] * 32
- SVSHAPE3[0:31] <- [0] * 32
- # set schedule up for multiply
- if (SVrm = 0b0000) then
- # VL in Matrix Multiply is xd*yd*zd
- xd <- (0b00 || SVxd) + 1
- yd <- (0b00 || SVyd) + 1
- zd <- (0b00 || SVzd) + 1
- n <- xd * yd * zd
- vlen[0:6] <- n[14:20]
- # set up template in SVSHAPE0, then copy to 1-3
- SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
- SVSHAPE0[6:11] <- (0b0 || SVyd) # ydim
- SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim
- SVSHAPE0[28:29] <- 0b11 # skip z
- # copy
- SVSHAPE1[0:31] <- SVSHAPE0[0:31]
- SVSHAPE2[0:31] <- SVSHAPE0[0:31]
- SVSHAPE3[0:31] <- SVSHAPE0[0:31]
- # set up FRA
- SVSHAPE1[18:20] <- 0b001 # permute x,z,y
- SVSHAPE1[28:29] <- 0b01 # skip z
- # FRC
- SVSHAPE2[18:20] <- 0b001 # permute x,z,y
- SVSHAPE2[28:29] <- 0b11 # skip y
- # set schedule up for FFT butterfly
- if (SVrm = 0b0001) then
- # calculate O(N log2 N)
- n <- [0] * 3
- do while n < 5
- if SVxd[4-n] = 0 then
- leave
- n <- n + 1
- n <- ((0b0 || SVxd) + 1) * n
- vlen[0:6] <- n[1:7]
- # set up template in SVSHAPE0, then copy to 1-3
- # for FRA and FRT
- SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
- SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D FFT)
- mscale <- (0b0 || SVzd) + 1
- SVSHAPE0[30:31] <- 0b01 # Butterfly mode
- # copy
- SVSHAPE1[0:31] <- SVSHAPE0[0:31]
- SVSHAPE2[0:31] <- SVSHAPE0[0:31]
- # set up FRB and FRS
- SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule
- # FRC (coefficients)
- SVSHAPE2[28:29] <- 0b10 # k schedule
- # set schedule up for (i)DCT Inner butterfly
- # SVrm Mode 4 (Mode 12 for iDCT) is for on-the-fly (Vertical-First Mode)
- if ((SVrm = 0b0100) |
- (SVrm = 0b1100)) then
- # calculate O(N log2 N)
- n <- [0] * 3
- do while n < 5
- if SVxd[4-n] = 0 then
- leave
- n <- n + 1
- n <- ((0b0 || SVxd) + 1) * n
- vlen[0:6] <- n[1:7]
- # set up template in SVSHAPE0, then copy to 1-3
- # set up FRB and FRS
- SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
- SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
- mscale <- (0b0 || SVzd) + 1
- if (SVrm = 0b1100) then
- SVSHAPE0[30:31] <- 0b11 # iDCT mode
- SVSHAPE0[18:20] <- 0b011 # iDCT Inner Butterfly sub-mode
- else
- SVSHAPE0[30:31] <- 0b01 # DCT mode
- SVSHAPE0[18:20] <- 0b001 # DCT Inner Butterfly sub-mode
- SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop
- SVSHAPE0[6:11] <- 0b000011 # (i)DCT Inner Butterfly mode 4
- # copy
- SVSHAPE1[0:31] <- SVSHAPE0[0:31]
- SVSHAPE2[0:31] <- SVSHAPE0[0:31]
- if (SVrm != 0b0100) & (SVrm != 0b1100) then
- SVSHAPE3[0:31] <- SVSHAPE0[0:31]
- # for FRA and FRT
- SVSHAPE0[28:29] <- 0b01 # j+halfstep schedule
- # for cos coefficient
- SVSHAPE2[28:29] <- 0b10 # ci (k for mode 4) schedule
- SVSHAPE2[12:17] <- 0b000000 # reset costable "striding" to 1
- if (SVrm != 0b0100) & (SVrm != 0b1100) then
- SVSHAPE3[28:29] <- 0b11 # size schedule
- # set schedule up for (i)DCT Outer butterfly
- if (SVrm = 0b0011) | (SVrm = 0b1011) then
- # calculate O(N log2 N) number of outer butterfly overlapping adds
- vlen[0:6] <- [0] * 7
- n <- 0b000
- size <- 0b0000001
- itercount[0:6] <- (0b00 || SVxd) + 0b0000001
- itercount[0:6] <- (0b0 || itercount[0:5])
- do while n < 5
- if SVxd[4-n] = 0 then
- leave
- n <- n + 1
- count <- (itercount - 0b0000001) * size
- vlen[0:6] <- vlen + count[7:13]
- size[0:6] <- (size[1:6] || 0b0)
- itercount[0:6] <- (0b0 || itercount[0:5])
- # set up template in SVSHAPE0, then copy to 1-3
- # set up FRB and FRS
- SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
- SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
- mscale <- (0b0 || SVzd) + 1
- if (SVrm = 0b1011) then
- SVSHAPE0[30:31] <- 0b11 # iDCT mode
- SVSHAPE0[18:20] <- 0b011 # iDCT Outer Butterfly sub-mode
- SVSHAPE0[21:23] <- 0b101 # "inverse" on outer and inner loop
- else
- SVSHAPE0[30:31] <- 0b01 # DCT mode
- SVSHAPE0[18:20] <- 0b100 # DCT Outer Butterfly sub-mode
- SVSHAPE0[6:11] <- 0b000010 # DCT Butterfly mode
- # copy
- SVSHAPE1[0:31] <- SVSHAPE0[0:31] # j+halfstep schedule
- SVSHAPE2[0:31] <- SVSHAPE0[0:31] # costable coefficients
- # for FRA and FRT
- SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule
- # reset costable "striding" to 1
- SVSHAPE2[12:17] <- 0b000000
- # set schedule up for DCT COS table generation
- if (SVrm = 0b0101) | (SVrm = 0b1101) then
- # calculate O(N log2 N)
- vlen[0:6] <- [0] * 7
- itercount[0:6] <- (0b00 || SVxd) + 0b0000001
- itercount[0:6] <- (0b0 || itercount[0:5])
- n <- [0] * 3
- do while n < 5
- if SVxd[4-n] = 0 then
- leave
- n <- n + 1
- vlen[0:6] <- vlen + itercount
- itercount[0:6] <- (0b0 || itercount[0:5])
- # set up template in SVSHAPE0, then copy to 1-3
- # set up FRB and FRS
- SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
- SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
- mscale <- (0b0 || SVzd) + 1
- SVSHAPE0[30:31] <- 0b01 # DCT/FFT mode
- SVSHAPE0[6:11] <- 0b000100 # DCT Inner Butterfly COS-gen mode
- if (SVrm = 0b0101) then
- SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop for DCT
- # copy
- SVSHAPE1[0:31] <- SVSHAPE0[0:31]
- SVSHAPE2[0:31] <- SVSHAPE0[0:31]
- # for cos coefficient
- SVSHAPE1[28:29] <- 0b10 # ci schedule
- SVSHAPE2[28:29] <- 0b11 # size schedule
- # set schedule up for iDCT / DCT inverse of half-swapped ordering
- if (SVrm = 0b0110) | (SVrm = 0b1110) | (SVrm = 0b1111) then
- vlen[0:6] <- (0b00 || SVxd) + 0b0000001
- # set up template in SVSHAPE0
- SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
- SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
- mscale <- (0b0 || SVzd) + 1
- if (SVrm = 0b1110) then
- SVSHAPE0[18:20] <- 0b001 # DCT opposite half-swap
- if (SVrm = 0b1111) then
- SVSHAPE0[30:31] <- 0b01 # FFT mode
- else
- SVSHAPE0[30:31] <- 0b11 # DCT mode
- SVSHAPE0[6:11] <- 0b000101 # DCT "half-swap" mode
- # set schedule up for parallel reduction or prefix-sum
- if (SVrm = 0b0111) then
- # is scan/prefix-sum
- is_scan <- SVyd = 2
- # calculate the total number of operations (brute-force)
- vlen[0:6] <- [0] * 7
- itercount[0:6] <- (0b00 || SVxd) + 0b0000001
- if is_scan then
- # prefix sum algorithm with operations replaced with
- # incrementing vlen
- dist <- 1
- vlen[0:6] <- 0
- do while dist <u itercount
- start <- dist * 2 - 1
- step <- dist * 2
- i <- start
- do while i <u itercount
- vlen[0:6] <- vlen[0:6] + 1
- i <- i + step
- dist <- dist * 2
- dist <- dist / 2
- do while dist != 0
- i <- dist * 3 - 1
- do while i <u itercount
- vlen[0:6] <- vlen[0:6] + 1
- i <- i + dist * 2
- dist <- dist / 2
- else
- step <- 0b0000001
- i <- 0b0000000
- do while step <u itercount
- newstep <- step[1:6] || 0b0
- j[0:6] <- 0b0000000
- do while (j+step <u itercount)
- j <- j + newstep
- i <- i + 1
- step <- newstep
- # VL in Parallel-Reduce is the number of operations
- vlen[0:6] <- i
- # set up template in SVSHAPE0, then copy to 1. only 2 needed
- SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
- SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
- mscale <- (0b0 || SVzd) + 1
- SVSHAPE0[30:31] <- 0b10 # parallel reduce/prefix submode
- # copy
- SVSHAPE1[0:31] <- SVSHAPE0[0:31]
- # set up submodes: parallel or prefix
- SVSHAPE0[28:29] <- 0b00 # left operand
- SVSHAPE1[28:29] <- 0b01 # right operand
- if is_scan then
- SVSHAPE0[28:29] <- 0b10 # left operand
- SVSHAPE1[28:29] <- 0b11 # right operand
- # set VL, MVL and Vertical-First
- m[0:12] <- vlen * mscale
- maxvl[0:6] <- m[6:12]
- SVSTATE[0:6] <- maxvl # MAVXL
- SVSTATE[7:13] <- vlen # VL
- SVSTATE[63] <- vf
-
-Special Registers Altered:
-
- None
-
-# svindex
-
-SVI-Form
-
-* svindex SVG,rmm,SVd,ew,SVyx,mm,sk
-
-Pseudo-code:
-
- # based on nearest MAXVL compute other dimension
- MVL <- SVSTATE[0:6]
- d <- [0] * 6
- dim <- SVd+1
- do while d*dim <u ([0]*4 || MVL)
- d <- d + 1
- # set up template, then copy once location identified
- shape <- [0]*32
- shape[30:31] <- 0b00 # mode
- if SVyx = 0 then
- shape[18:20] <- 0b110 # indexed xd/yd
- shape[0:5] <- (0b0 || SVd) # xdim
- if sk = 0 then shape[6:11] <- 0 # ydim
- else shape[6:11] <- 0b111111 # ydim max
- else
- shape[18:20] <- 0b111 # indexed yd/xd
- if sk = 1 then shape[6:11] <- 0 # ydim
- else shape[6:11] <- d-1 # ydim max
- shape[0:5] <- (0b0 || SVd) # ydim
- shape[12:17] <- (0b0 || SVG) # SVGPR
- shape[28:29] <- ew # element-width override
- shape[21] <- sk # skip 1st dimension
- # select the mode for updating SVSHAPEs
- SVSTATE[62] <- mm # set or clear persistence
- if mm = 0 then
- # clear out all SVSHAPEs first
- SVSHAPE0[0:31] <- [0] * 32
- SVSHAPE1[0:31] <- [0] * 32
- SVSHAPE2[0:31] <- [0] * 32
- SVSHAPE3[0:31] <- [0] * 32
- SVSTATE[32:41] <- [0] * 10 # clear REMAP.mi/o
- SVSTATE[42:46] <- rmm # rmm exactly REMAP.SVme
- idx <- 0
- for bit = 0 to 4
- if rmm[4-bit] then
- # activate requested shape
- if idx = 0 then SVSHAPE0 <- shape
- if idx = 1 then SVSHAPE1 <- shape
- if idx = 2 then SVSHAPE2 <- shape
- if idx = 3 then SVSHAPE3 <- shape
- SVSTATE[bit*2+32:bit*2+33] <- idx
- # increment shape index, modulo 4
- if idx = 3 then idx <- 0
- else idx <- idx + 1
- else
- # refined SVSHAPE/REMAP update mode
- bit <- rmm[0:2]
- idx <- rmm[3:4]
- if idx = 0 then SVSHAPE0 <- shape
- if idx = 1 then SVSHAPE1 <- shape
- if idx = 2 then SVSHAPE2 <- shape
- if idx = 3 then SVSHAPE3 <- shape
- SVSTATE[bit*2+32:bit*2+33] <- idx
- SVSTATE[46-bit] <- 1
-
-Special Registers Altered:
-
- None
-
-# svshape2
-
-SVM2-Form
-
-* svshape2 SVo,SVyx,rmm,SVd,sk,mm
-
-Pseudo-code:
-
- # based on nearest MAXVL compute other dimension
- MVL <- SVSTATE[0:6]
- d <- [0] * 6
- dim <- SVd+1
- do while d*dim <u ([0]*4 || MVL)
- d <- d + 1
- # set up template, then copy once location identified
- shape <- [0]*32
- shape[30:31] <- 0b00 # mode
- shape[0:5] <- (0b0 || SVd) # x/ydim
- if SVyx = 0 then
- shape[18:20] <- 0b000 # ordering xd/yd(/zd)
- if sk = 0 then shape[6:11] <- 0 # ydim
- else shape[6:11] <- 0b111111 # ydim max
- else
- shape[18:20] <- 0b010 # ordering yd/xd(/zd)
- if sk = 1 then shape[6:11] <- 0 # ydim
- else shape[6:11] <- d-1 # ydim max
- # offset (the prime purpose of this instruction)
- shape[24:27] <- SVo # offset
- if sk = 1 then shape[28:29] <- 0b01 # skip 1st dimension
- else shape[28:29] <- 0b00 # no skipping
- # select the mode for updating SVSHAPEs
- SVSTATE[62] <- mm # set or clear persistence
- if mm = 0 then
- # clear out all SVSHAPEs first
- SVSHAPE0[0:31] <- [0] * 32
- SVSHAPE1[0:31] <- [0] * 32
- SVSHAPE2[0:31] <- [0] * 32
- SVSHAPE3[0:31] <- [0] * 32
- SVSTATE[32:41] <- [0] * 10 # clear REMAP.mi/o
- SVSTATE[42:46] <- rmm # rmm exactly REMAP.SVme
- idx <- 0
- for bit = 0 to 4
- if rmm[4-bit] then
- # activate requested shape
- if idx = 0 then SVSHAPE0 <- shape
- if idx = 1 then SVSHAPE1 <- shape
- if idx = 2 then SVSHAPE2 <- shape
- if idx = 3 then SVSHAPE3 <- shape
- SVSTATE[bit*2+32:bit*2+33] <- idx
- # increment shape index, modulo 4
- if idx = 3 then idx <- 0
- else idx <- idx + 1
- else
- # refined SVSHAPE/REMAP update mode
- bit <- rmm[0:2]
- idx <- rmm[3:4]
- if idx = 0 then SVSHAPE0 <- shape
- if idx = 1 then SVSHAPE1 <- shape
- if idx = 2 then SVSHAPE2 <- shape
- if idx = 3 then SVSHAPE3 <- shape
- SVSTATE[bit*2+32:bit*2+33] <- idx
- SVSTATE[46-bit] <- 1
-
-Special Registers Altered:
-
- None
+[[!inline pagenames="openpower/isa/simplev/svindex" raw="yes"]]
+[[!inline pagenames="openpower/isa/simplev/svshape2" raw="yes"]]
--- /dev/null
+ # Common setup for the svshape pseudo-code; runs before the SVrm-specific schedules.
+ # for convenience, VL to be calculated and stored in SVSTATE
+ vlen <- [0] * 7
+ mscale[0:5] <- 0b000001 # for scaling MAXVL (starts as a multiplier of 1)
+ itercount[0:6] <- [0] * 7
+ # zero SVSTATE bits 0:31; bits 0:6 (MAXVL) and 7:13 (VL) are rewritten in the final step
+ SVSTATE[0:31] <- [0] * 32
+ # only overwrite REMAP if "persistence" is zero
+ # NOTE(review): indentation is flattened in this view, so how many of the
+ # assignments below belong to this "if" cannot be confirmed from here.
+ if (SVSTATE[62] = 0b0) then
+ SVSTATE[32:33] <- 0b00
+ SVSTATE[34:35] <- 0b00
+ SVSTATE[36:37] <- 0b00
+ SVSTATE[38:39] <- 0b00
+ SVSTATE[40:41] <- 0b00
+ SVSTATE[42:46] <- 0b00000
+ SVSTATE[62] <- 0b0
+ SVSTATE[63] <- 0b0
+ # clear out all SVSHAPEs
+ SVSHAPE0[0:31] <- [0] * 32
+ SVSHAPE1[0:31] <- [0] * 32
+ SVSHAPE2[0:31] <- [0] * 32
+ SVSHAPE3[0:31] <- [0] * 32
+ # set schedule up for multiply
+ if (SVrm = 0b0000) then
+ # VL in Matrix Multiply is xd*yd*zd
+ # the SVxd/SVyd/SVzd fields appear to encode (dimension - 1), hence the +1
+ xd <- (0b00 || SVxd) + 1
+ yd <- (0b00 || SVyd) + 1
+ zd <- (0b00 || SVzd) + 1
+ n <- xd * yd * zd
+ # presumably bits 14:20 are the low 7 bits of the product - TODO confirm width of n
+ vlen[0:6] <- n[14:20]
+ # set up template in SVSHAPE0, then copy to 1-3
+ SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
+ SVSHAPE0[6:11] <- (0b0 || SVyd) # ydim
+ SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim
+ # NOTE(review): 0b11 is labelled "skip z" here but "skip y" on SVSHAPE2 below,
+ # and 0b01 is also labelled "skip z" on SVSHAPE1 - confirm the intended labels.
+ SVSHAPE0[28:29] <- 0b11 # skip z
+ # copy
+ SVSHAPE1[0:31] <- SVSHAPE0[0:31]
+ SVSHAPE2[0:31] <- SVSHAPE0[0:31]
+ SVSHAPE3[0:31] <- SVSHAPE0[0:31]
+ # set up FRA
+ SVSHAPE1[18:20] <- 0b001 # permute x,z,y
+ SVSHAPE1[28:29] <- 0b01 # skip z
+ # FRC
+ SVSHAPE2[18:20] <- 0b001 # permute x,z,y
+ SVSHAPE2[28:29] <- 0b11 # skip y
+ # set schedule up for FFT butterfly
+ if (SVrm = 0b0001) then
+ # calculate O(N log2 N)
+ # the loop counts (up to 5) successive set bits of SVxd, from bit 4 towards
+ # bit 0, leaving at the first zero bit
+ n <- [0] * 3
+ do while n < 5
+ if SVxd[4-n] = 0 then
+ leave
+ n <- n + 1
+ # VL is (SVxd+1) multiplied by that bit count
+ n <- ((0b0 || SVxd) + 1) * n
+ vlen[0:6] <- n[1:7]
+ # set up template in SVSHAPE0, then copy to 1-3
+ # NOTE(review): only SVSHAPE1 and SVSHAPE2 receive the copy in this section
+ # for FRA and FRT
+ SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
+ SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D FFT)
+ mscale <- (0b0 || SVzd) + 1 # MAXVL multiplier consumed by the final step
+ SVSHAPE0[30:31] <- 0b01 # Butterfly mode
+ # copy
+ SVSHAPE1[0:31] <- SVSHAPE0[0:31]
+ SVSHAPE2[0:31] <- SVSHAPE0[0:31]
+ # set up FRB and FRS
+ SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule
+ # FRC (coefficients)
+ SVSHAPE2[28:29] <- 0b10 # k schedule
+ # set schedule up for (i)DCT Inner butterfly
+ # SVrm Mode 4 (Mode 12 for iDCT) is for on-the-fly (Vertical-First Mode)
+ if ((SVrm = 0b0100) |
+ (SVrm = 0b1100)) then
+ # calculate O(N log2 N)
+ # same VL calculation as the FFT butterfly schedule: count successive set
+ # bits of SVxd (from bit 4 towards bit 0), then multiply by (SVxd+1)
+ n <- [0] * 3
+ do while n < 5
+ if SVxd[4-n] = 0 then
+ leave
+ n <- n + 1
+ n <- ((0b0 || SVxd) + 1) * n
+ vlen[0:6] <- n[1:7]
+ # set up template in SVSHAPE0, then copy to 1-3
+ # set up FRB and FRS
+ SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
+ SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
+ mscale <- (0b0 || SVzd) + 1 # MAXVL multiplier consumed by the final step
+ if (SVrm = 0b1100) then
+ SVSHAPE0[30:31] <- 0b11 # iDCT mode
+ SVSHAPE0[18:20] <- 0b011 # iDCT Inner Butterfly sub-mode
+ else
+ SVSHAPE0[30:31] <- 0b01 # DCT mode
+ SVSHAPE0[18:20] <- 0b001 # DCT Inner Butterfly sub-mode
+ SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop
+ SVSHAPE0[6:11] <- 0b000011 # (i)DCT Inner Butterfly mode 4
+ # copy
+ SVSHAPE1[0:31] <- SVSHAPE0[0:31]
+ SVSHAPE2[0:31] <- SVSHAPE0[0:31]
+ # NOTE(review): if this test is nested inside the SVrm = 0b0100 / 0b1100
+ # check above it can never be true, so SVSHAPE3 would never be written
+ # here - indentation is flattened in this view, confirm the nesting.
+ if (SVrm != 0b0100) & (SVrm != 0b1100) then
+ SVSHAPE3[0:31] <- SVSHAPE0[0:31]
+ # for FRA and FRT
+ SVSHAPE0[28:29] <- 0b01 # j+halfstep schedule
+ # for cos coefficient
+ SVSHAPE2[28:29] <- 0b10 # ci (k for mode 4) schedule
+ SVSHAPE2[12:17] <- 0b000000 # reset costable "striding" to 1
+ # NOTE(review): same apparently-unreachable condition as the SVSHAPE3 copy above
+ if (SVrm != 0b0100) & (SVrm != 0b1100) then
+ SVSHAPE3[28:29] <- 0b11 # size schedule
+ # set schedule up for (i)DCT Outer butterfly
+ if (SVrm = 0b0011) | (SVrm = 0b1011) then
+ # calculate O(N log2 N) number of outer butterfly overlapping adds
+ vlen[0:6] <- [0] * 7
+ n <- 0b000
+ size <- 0b0000001
+ # itercount starts at SVxd+1, then is shifted right by one bit (halved)
+ itercount[0:6] <- (0b00 || SVxd) + 0b0000001
+ itercount[0:6] <- (0b0 || itercount[0:5])
+ # for each successive set bit of SVxd (bit 4 towards bit 0, at most 5):
+ # add (itercount-1)*size to vlen, double size, halve itercount
+ do while n < 5
+ if SVxd[4-n] = 0 then
+ leave
+ n <- n + 1
+ count <- (itercount - 0b0000001) * size
+ vlen[0:6] <- vlen + count[7:13]
+ size[0:6] <- (size[1:6] || 0b0) # shift left by one bit
+ itercount[0:6] <- (0b0 || itercount[0:5]) # shift right by one bit
+ # set up template in SVSHAPE0, then copy to 1-3
+ # NOTE(review): only SVSHAPE1 and SVSHAPE2 receive the copy in this section
+ # set up FRB and FRS
+ SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
+ SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
+ mscale <- (0b0 || SVzd) + 1 # MAXVL multiplier consumed by the final step
+ if (SVrm = 0b1011) then
+ SVSHAPE0[30:31] <- 0b11 # iDCT mode
+ SVSHAPE0[18:20] <- 0b011 # iDCT Outer Butterfly sub-mode
+ SVSHAPE0[21:23] <- 0b101 # "inverse" on outer and inner loop
+ else
+ SVSHAPE0[30:31] <- 0b01 # DCT mode
+ SVSHAPE0[18:20] <- 0b100 # DCT Outer Butterfly sub-mode
+ SVSHAPE0[6:11] <- 0b000010 # DCT Butterfly mode
+ # copy
+ SVSHAPE1[0:31] <- SVSHAPE0[0:31] # j+halfstep schedule
+ SVSHAPE2[0:31] <- SVSHAPE0[0:31] # costable coefficients
+ # for FRA and FRT
+ SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule
+ # reset costable "striding" to 1
+ SVSHAPE2[12:17] <- 0b000000
+ # set schedule up for DCT COS table generation
+ if (SVrm = 0b0101) | (SVrm = 0b1101) then
+ # calculate O(N log2 N)
+ vlen[0:6] <- [0] * 7
+ # itercount starts at SVxd+1, then is shifted right by one bit (halved)
+ itercount[0:6] <- (0b00 || SVxd) + 0b0000001
+ itercount[0:6] <- (0b0 || itercount[0:5])
+ # for each successive set bit of SVxd (bit 4 towards bit 0, at most 5):
+ # add itercount to vlen, then halve itercount again
+ n <- [0] * 3
+ do while n < 5
+ if SVxd[4-n] = 0 then
+ leave
+ n <- n + 1
+ vlen[0:6] <- vlen + itercount
+ itercount[0:6] <- (0b0 || itercount[0:5])
+ # set up template in SVSHAPE0, then copy to 1-3
+ # NOTE(review): only SVSHAPE1 and SVSHAPE2 receive the copy in this section
+ # set up FRB and FRS
+ SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
+ SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
+ mscale <- (0b0 || SVzd) + 1 # MAXVL multiplier consumed by the final step
+ SVSHAPE0[30:31] <- 0b01 # DCT/FFT mode
+ SVSHAPE0[6:11] <- 0b000100 # DCT Inner Butterfly COS-gen mode
+ if (SVrm = 0b0101) then
+ SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop for DCT
+ # copy
+ SVSHAPE1[0:31] <- SVSHAPE0[0:31]
+ SVSHAPE2[0:31] <- SVSHAPE0[0:31]
+ # for cos coefficient
+ SVSHAPE1[28:29] <- 0b10 # ci schedule
+ SVSHAPE2[28:29] <- 0b11 # size schedule
+ # set schedule up for iDCT / DCT inverse of half-swapped ordering
+ # (SVrm 0b1111 takes the same path but selects the mode labelled "FFT" below)
+ if (SVrm = 0b0110) | (SVrm = 0b1110) | (SVrm = 0b1111) then
+ # VL is simply SVxd+1 for this schedule; only SVSHAPE0 is configured
+ vlen[0:6] <- (0b00 || SVxd) + 0b0000001
+ # set up template in SVSHAPE0
+ SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
+ SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
+ mscale <- (0b0 || SVzd) + 1 # MAXVL multiplier consumed by the final step
+ if (SVrm = 0b1110) then
+ SVSHAPE0[18:20] <- 0b001 # DCT opposite half-swap
+ if (SVrm = 0b1111) then
+ SVSHAPE0[30:31] <- 0b01 # FFT mode
+ else
+ # NOTE(review): 0b11 is labelled "iDCT mode" in the Inner/Outer butterfly
+ # schedules but "DCT mode" here - confirm which label is intended.
+ SVSHAPE0[30:31] <- 0b11 # DCT mode
+ SVSHAPE0[6:11] <- 0b000101 # DCT "half-swap" mode
+ # set schedule up for parallel reduction or prefix-sum
+ if (SVrm = 0b0111) then
+ # is scan/prefix-sum
+ # SVyd = 2 selects scan/prefix-sum; otherwise the reduction count in the
+ # "else" branch below is used
+ is_scan <- SVyd = 2
+ # calculate the total number of operations (brute-force)
+ vlen[0:6] <- [0] * 7
+ itercount[0:6] <- (0b00 || SVxd) + 0b0000001
+ if is_scan then
+ # prefix sum algorithm with operations replaced with
+ # incrementing vlen
+ dist <- 1
+ vlen[0:6] <- 0
+ # first pass: dist doubles each round while below itercount; one
+ # operation is counted per index from dist*2-1 in steps of dist*2
+ do while dist <u itercount
+ start <- dist * 2 - 1
+ step <- dist * 2
+ i <- start
+ do while i <u itercount
+ vlen[0:6] <- vlen[0:6] + 1
+ i <- i + step
+ dist <- dist * 2
+ # second pass: dist halves each round until it reaches zero; one
+ # operation is counted per index from dist*3-1 in steps of dist*2
+ dist <- dist / 2
+ do while dist != 0
+ i <- dist * 3 - 1
+ do while i <u itercount
+ vlen[0:6] <- vlen[0:6] + 1
+ i <- i + dist * 2
+ dist <- dist / 2
+ else
+ # reduction: step doubles (shift left) each round; i counts one
+ # operation for every j with j+step below itercount
+ step <- 0b0000001
+ i <- 0b0000000
+ do while step <u itercount
+ newstep <- step[1:6] || 0b0
+ j[0:6] <- 0b0000000
+ do while (j+step <u itercount)
+ j <- j + newstep
+ i <- i + 1
+ step <- newstep
+ # VL in Parallel-Reduce is the number of operations
+ vlen[0:6] <- i
+ # set up template in SVSHAPE0, then copy to 1. only 2 needed
+ SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
+ SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
+ mscale <- (0b0 || SVzd) + 1 # MAXVL multiplier consumed by the final step
+ SVSHAPE0[30:31] <- 0b10 # parallel reduce/prefix submode
+ # copy
+ SVSHAPE1[0:31] <- SVSHAPE0[0:31]
+ # set up submodes: parallel or prefix
+ SVSHAPE0[28:29] <- 0b00 # left operand
+ SVSHAPE1[28:29] <- 0b01 # right operand
+ # scan/prefix-sum replaces the two values just written with 0b10 / 0b11
+ if is_scan then
+ SVSHAPE0[28:29] <- 0b10 # left operand
+ SVSHAPE1[28:29] <- 0b11 # right operand
+ # set VL, MVL and Vertical-First
+ # MAXVL is VL times mscale (1 unless a schedule above replaced it with SVzd+1);
+ # only the last 7 bits (6:12) of the 13-bit product are kept
+ m[0:12] <- vlen * mscale
+ maxvl[0:6] <- m[6:12]
+ SVSTATE[0:6] <- maxvl # MAXVL
+ SVSTATE[7:13] <- vlen # VL
+ SVSTATE[63] <- vf # Vertical-First mode bit taken from the vf operand