# svstep

SVL-Form

* svstep RT,SVi,vf (Rc=0)
* svstep. RT,SVi,vf (Rc=1)

Pseudo-code:

    # SVi bits 3:4 both set selects the form that writes SVSTATE[53:54];
    # every other SVi value goes through SVSTATE_NEXT instead.
    if SVi[3:4] = 0b11 then
        # store pack and unpack in SVSTATE
        SVSTATE[53] <- SVi[5]
        SVSTATE[54] <- SVi[6]
        # return the two bits just written, prefixed with 62 zero bits
        RT <- [0]*62 || SVSTATE[53:54]
    else
        # SVSTATE_NEXT is defined outside this pseudo-code; its result is
        # returned prefixed with 57 zero bits.
        # NOTE(review): presumably step is 7 bits wide - confirm
        step <- SVSTATE_NEXT(SVi, vf)
        RT <- [0]*57 || step

Special Registers Altered:

    CR0                     (if Rc=1)

# setvl

SVL-Form

* setvl RT,RA,SVi,vf,vs,ms (Rc=0)
* setvl. RT,RA,SVi,vf,vs,ms (Rc=1)

Pseudo-code:

    # overflow is raised wherever VL has to be clamped below.
    # NOTE(review): overflow is only written in this pseudo-code, never
    # read; presumably it is consumed for CR0 when Rc=1 - confirm
    overflow <- 0b0
    # the immediate operand is biased by one before use
    VLimm <- SVi + 1
    # set or get MVL
    if ms = 1 then
        MVL <- VLimm[0:6]
    else
        MVL <- SVSTATE[0:6]
    # set or get VL
    if vs = 0 then
        # vs clear: keep the VL currently held in SVSTATE
        VL <- SVSTATE[7:13]
    else if _RA != 0 then
        # _RA non-zero: VL is taken from (RA), saturated to 0b1111111
        if (RA) >u 0b1111111 then
            VL <- 0b1111111
            overflow <- 0b1
        else
            VL <- (RA)[57:63]
    else if _RT = 0 then
        # _RA and _RT both zero: VL is taken from the immediate
        VL <- VLimm[0:6]
    else if CTR >u 0b1111111 then
        # _RA zero, _RT non-zero: VL is taken from CTR, saturated
        VL <- 0b1111111
        overflow <- 0b1
    else
        VL <- CTR[57:63]
    # limit VL to within MVL
    if VL >u MVL then
        overflow <- 0b1
        VL <- MVL
    # write both lengths back into SVSTATE
    SVSTATE[0:6] <- MVL
    SVSTATE[7:13] <- VL
    # the resulting VL is also written to a GPR unless _RT is zero
    if _RT != 0 then
        GPR(_RT) <- [0]*57 || VL
    # MAXVL is a static "state-reset": when it is being set (ms = 1),
    # SVSTATE[62] and SVSTATE[63] are rewritten as well.
    if ms = 1 then
        SVSTATE[63] <- vf   # set Vertical-First mode
        SVSTATE[62] <- 0b0  # clear persist bit

Special Registers Altered:

    CR0                     (if Rc=1)

# svremap

SVRM-Form

* svremap SVme,mi0,mi1,mi2,mo0,mo1,pst

Pseudo-code:

    # registers RA RB RC RT EA/FRS SVSHAPE0-3 indices
    # (five 2-bit SVSTATE fields, each written directly from its operand)
    SVSTATE[32:33] <- mi0
    SVSTATE[34:35] <- mi1
    SVSTATE[36:37] <- mi2
    SVSTATE[38:39] <- mo0
    SVSTATE[40:41] <- mo1
    # enable bit for RA RB RC RT EA/FRS
    SVSTATE[42:46] <- SVme
    # persistence bit (applies to more than one instruction)
    SVSTATE[62] <- pst

Special Registers Altered:

    None

# svshape SVM-Form * svshape SVxd,SVyd,SVzd,SVrm,vf Pseudo-code: # for convenience, VL to be calculated and stored in SVSTATE vlen <- [0] * 7 mscale[0:5] <- 0b000001 # for scaling MAXVL itercount[0:6] <- [0] * 7 SVSTATE[0:31] <- [0] * 32 # only overwrite REMAP if "persistence" is zero if (SVSTATE[62] = 0b0) then SVSTATE[32:33] <- 0b00 SVSTATE[34:35] <- 0b00 SVSTATE[36:37] <- 0b00 SVSTATE[38:39] <- 0b00 SVSTATE[40:41] <- 0b00 SVSTATE[42:46] <- 0b00000 SVSTATE[62] <- 0b0 
SVSTATE[63] <- 0b0 # clear out all SVSHAPEs SVSHAPE0[0:31] <- [0] * 32 SVSHAPE1[0:31] <- [0] * 32 SVSHAPE2[0:31] <- [0] * 32 SVSHAPE3[0:31] <- [0] * 32 # set schedule up for multiply if (SVrm = 0b0000) then # VL in Matrix Multiply is xd*yd*zd xd <- (0b00 || SVxd) + 1 yd <- (0b00 || SVyd) + 1 zd <- (0b00 || SVzd) + 1 n <- xd * yd * zd vlen[0:6] <- n[14:20] # set up template in SVSHAPE0, then copy to 1-3 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[6:11] <- (0b0 || SVyd) # ydim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim SVSHAPE0[28:29] <- 0b11 # skip z # copy SVSHAPE1[0:31] <- SVSHAPE0[0:31] SVSHAPE2[0:31] <- SVSHAPE0[0:31] SVSHAPE3[0:31] <- SVSHAPE0[0:31] # set up FRA SVSHAPE1[18:20] <- 0b001 # permute x,z,y SVSHAPE1[28:29] <- 0b01 # skip z # FRC SVSHAPE2[18:20] <- 0b001 # permute x,z,y SVSHAPE2[28:29] <- 0b11 # skip y # set schedule up for FFT butterfly if (SVrm = 0b0001) then # calculate O(N log2 N) n <- [0] * 3 do while n < 5 if SVxd[4-n] = 0 then leave n <- n + 1 n <- ((0b0 || SVxd) + 1) * n vlen[0:6] <- n[1:7] # set up template in SVSHAPE0, then copy to 1-3 # for FRA and FRT SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D FFT) mscale <- (0b0 || SVzd) + 1 SVSHAPE0[30:31] <- 0b01 # Butterfly mode # copy SVSHAPE1[0:31] <- SVSHAPE0[0:31] SVSHAPE2[0:31] <- SVSHAPE0[0:31] # set up FRB and FRS SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule # FRC (coefficients) SVSHAPE2[28:29] <- 0b10 # k schedule # set schedule up for (i)DCT Inner butterfly # SVrm Mode 4 (Mode 12 for iDCT) is for on-the-fly (Vertical-First Mode) if ((SVrm = 0b0100) | (SVrm = 0b1100)) then # calculate O(N log2 N) n <- [0] * 3 do while n < 5 if SVxd[4-n] = 0 then leave n <- n + 1 n <- ((0b0 || SVxd) + 1) * n vlen[0:6] <- n[1:7] # set up template in SVSHAPE0, then copy to 1-3 # set up FRB and FRS SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT) mscale <- (0b0 || SVzd) + 1 if (SVrm = 0b1100) then 
SVSHAPE0[30:31] <- 0b11 # iDCT mode SVSHAPE0[18:20] <- 0b011 # iDCT Inner Butterfly sub-mode else SVSHAPE0[30:31] <- 0b01 # DCT mode SVSHAPE0[18:20] <- 0b001 # DCT Inner Butterfly sub-mode SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop SVSHAPE0[6:11] <- 0b000011 # (i)DCT Inner Butterfly mode 4 # copy SVSHAPE1[0:31] <- SVSHAPE0[0:31] SVSHAPE2[0:31] <- SVSHAPE0[0:31] if (SVrm != 0b0100) & (SVrm != 0b1100) then SVSHAPE3[0:31] <- SVSHAPE0[0:31] # for FRA and FRT SVSHAPE0[28:29] <- 0b01 # j+halfstep schedule # for cos coefficient SVSHAPE2[28:29] <- 0b10 # ci (k for mode 4) schedule SVSHAPE2[12:17] <- 0b000000 # reset costable "striding" to 1 if (SVrm != 0b0100) & (SVrm != 0b1100) then SVSHAPE3[28:29] <- 0b11 # size schedule # set schedule up for (i)DCT Outer butterfly if (SVrm = 0b0011) | (SVrm = 0b1011) then # calculate O(N log2 N) number of outer butterfly overlapping adds vlen[0:6] <- [0] * 7 n <- 0b000 size <- 0b0000001 itercount[0:6] <- (0b00 || SVxd) + 0b0000001 itercount[0:6] <- (0b0 || itercount[0:5]) do while n < 5 if SVxd[4-n] = 0 then leave n <- n + 1 count <- (itercount - 0b0000001) * size vlen[0:6] <- vlen + count[7:13] size[0:6] <- (size[1:6] || 0b0) itercount[0:6] <- (0b0 || itercount[0:5]) # set up template in SVSHAPE0, then copy to 1-3 # set up FRB and FRS SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT) mscale <- (0b0 || SVzd) + 1 if (SVrm = 0b1011) then SVSHAPE0[30:31] <- 0b11 # iDCT mode SVSHAPE0[18:20] <- 0b011 # iDCT Outer Butterfly sub-mode SVSHAPE0[21:23] <- 0b101 # "inverse" on outer and inner loop else SVSHAPE0[30:31] <- 0b01 # DCT mode SVSHAPE0[18:20] <- 0b100 # DCT Outer Butterfly sub-mode SVSHAPE0[6:11] <- 0b000010 # DCT Butterfly mode # copy SVSHAPE1[0:31] <- SVSHAPE0[0:31] # j+halfstep schedule SVSHAPE2[0:31] <- SVSHAPE0[0:31] # costable coefficients # for FRA and FRT SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule # reset costable "striding" to 1 SVSHAPE2[12:17] <- 0b000000 # set 
schedule up for DCT COS table generation if (SVrm = 0b0101) | (SVrm = 0b1101) then # calculate O(N log2 N) vlen[0:6] <- [0] * 7 itercount[0:6] <- (0b00 || SVxd) + 0b0000001 itercount[0:6] <- (0b0 || itercount[0:5]) n <- [0] * 3 do while n < 5 if SVxd[4-n] = 0 then leave n <- n + 1 vlen[0:6] <- vlen + itercount itercount[0:6] <- (0b0 || itercount[0:5]) # set up template in SVSHAPE0, then copy to 1-3 # set up FRB and FRS SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT) mscale <- (0b0 || SVzd) + 1 SVSHAPE0[30:31] <- 0b01 # DCT/FFT mode SVSHAPE0[6:11] <- 0b000100 # DCT Inner Butterfly COS-gen mode if (SVrm = 0b0101) then SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop for DCT # copy SVSHAPE1[0:31] <- SVSHAPE0[0:31] SVSHAPE2[0:31] <- SVSHAPE0[0:31] # for cos coefficient SVSHAPE1[28:29] <- 0b10 # ci schedule SVSHAPE2[28:29] <- 0b11 # size schedule # set schedule up for iDCT / DCT inverse of half-swapped ordering if (SVrm = 0b0110) | (SVrm = 0b1110) | (SVrm = 0b1111) then vlen[0:6] <- (0b00 || SVxd) + 0b0000001 # set up template in SVSHAPE0 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT) mscale <- (0b0 || SVzd) + 1 if (SVrm = 0b1110) then SVSHAPE0[18:20] <- 0b001 # DCT opposite half-swap if (SVrm = 0b1111) then SVSHAPE0[30:31] <- 0b01 # FFT mode else SVSHAPE0[30:31] <- 0b11 # DCT mode SVSHAPE0[6:11] <- 0b000101 # DCT "half-swap" mode # set schedule up for parallel reduction or prefix-sum if (SVrm = 0b0111) then # is scan/prefix-sum is_scan <- SVyd = 2 # calculate the total number of operations (brute-force) vlen[0:6] <- [0] * 7 itercount[0:6] <- (0b00 || SVxd) + 0b0000001 if is_scan then # prefix sum algorithm with operations replaced with # incrementing vlen dist <- 1 vlen[0:6] <- 0 do while dist