<!-- This defines Draft SVP64 instructions to augment PowerISA Version 3.0 -->
<!-- These are not described in book 1 -->

# svstep

SVL-Form

* svstep RT,SVi,vf (Rc=0)
* svstep. RT,SVi,vf (Rc=1)

Pseudo-code:

    # SVi[3:4] = 0b11 selects the "set pack/unpack" function; every other
    # value is handed to SVSTATE_NEXT together with vf.
    if SVi[3:4] != 0b11 then
        # result of SVSTATE_NEXT is returned zero-extended in RT
        nextstep <- SVSTATE_NEXT(SVi, vf)
        RT <- [0]*57 || nextstep
    else
        # store pack (SVi[5]) and unpack (SVi[6]) in SVSTATE bits 53 and 54,
        # then return the two stored bits zero-extended in RT
        SVSTATE[53:54] <- SVi[5:6]
        RT <- [0]*62 || SVSTATE[53:54]

Special Registers Altered:

    CR0                     (if Rc=1)
    SVSTATE

# setvl

SVL-Form

* setvl RT,RA,SVi,vf,vs,ms (Rc=0)
* setvl. RT,RA,SVi,vf,vs,ms (Rc=1)

Pseudo-code:

    # Sets (or reads back) MAXVL and VL in SVSTATE, limits VL to MAXVL and,
    # when the RT register number is non-zero, returns the resulting VL in RT.
    #
    # overflow records that the requested VL had to be reduced.  It is not
    # read again in this pseudo-code; presumably it feeds the Rc=1 CR0
    # result - TODO confirm.
    overflow <- 0b0
    # the immediate is encoded as "value minus one"
    # NOTE(review): if SVi is exactly 7 bits wide, SVi=0b1111111 would make
    # VLimm[0:6] wrap to zero - confirm whether that encoding is permitted.
    VLimm <- SVi + 1
    # set or get MVL
    if ms = 1 then MVL <- VLimm[0:6]
    else           MVL <- SVSTATE[0:6]
    # set or get VL.  Source of VL, in priority order:
    #   vs = 0                  : keep the VL already in SVSTATE
    #   _RA nonzero             : contents of RA, saturated to 127
    #   _RA zero, _RT zero      : the immediate (VLimm)
    #   _RA zero, _RT nonzero   : CTR, saturated to 127
    # (_RA/_RT look like the register-number fields, as opposed to (RA)
    # which is the register contents - confirm against the decoder.)
    if vs = 0                then VL <- SVSTATE[7:13]
    else if _RA != 0         then
        if (RA) >u 0b1111111 then
            VL <- 0b1111111
            overflow <- 0b1
        else                      VL <- (RA)[57:63]
    else if _RT = 0          then VL <- VLimm[0:6]
    else if CTR >u 0b1111111 then
        VL <- 0b1111111
        overflow <- 0b1
    else                          VL <- CTR[57:63]
    # limit VL to within MVL
    if VL >u MVL then
        overflow <- 0b1
        VL <- MVL
    # write both fields back, even when they were only read above
    SVSTATE[0:6] <- MVL
    SVSTATE[7:13] <- VL
    # return the final (limited) VL, zero-extended to 64 bits
    if _RT != 0 then
       GPR(_RT) <- [0]*57 || VL
    # MAXVL is a static "state-reset": setting it (ms=1) also loads
    # Vertical-First from vf and clears persistence.
    if ms = 1 then
        SVSTATE[63] <- vf   # set Vertical-First mode
        SVSTATE[62] <- 0b0  # clear persist bit

Special Registers Altered:

    CR0                     (if Rc=1)
    SVSTATE

# svremap

SVRM-Form

* svremap SVme,mi0,mi1,mi2,mo0,mo1,pst

Pseudo-code:

    # persistence bit (applies to more than one instruction)
    SVSTATE[62] <- pst
    # enable bits, one each for RA RB RC RT EA/FRS
    SVSTATE[42:46] <- SVme
    # SVSHAPE0-3 indices, two bits per register, for RA RB RC RT EA/FRS
    # (mi0, mi1, mi2, mo0, mo1 respectively), written as one 10-bit field
    SVSTATE[32:41] <- mi0 || mi1 || mi2 || mo0 || mo1

Special Registers Altered:

    SVSTATE

# svshape

SVM-Form

* svshape SVxd,SVyd,SVzd,SVrm,vf

Pseudo-code:

    # Sets up SVSTATE (VL, MAXVL, Vertical-First) and SVSHAPE0-3 for the
    # REMAP schedule selected by SVrm.  SVrm values that are not tested
    # below leave all SVSHAPEs cleared and VL/MAXVL at zero.
    #
    # for convenience, VL to be calculated and stored in SVSTATE
    vlen <- [0] * 7
    mscale[0:5] <- 0b000001 # for scaling MAXVL
    itercount[0:6] <- [0] * 7
    SVSTATE[0:31] <- [0] * 32
    # only overwrite REMAP if "persistence" is zero
    if (SVSTATE[62] = 0b0) then
        SVSTATE[32:33] <- 0b00
        SVSTATE[34:35] <- 0b00
        SVSTATE[36:37] <- 0b00
        SVSTATE[38:39] <- 0b00
        SVSTATE[40:41] <- 0b00
        SVSTATE[42:46] <- 0b00000
        SVSTATE[62] <- 0b0
        SVSTATE[63] <- 0b0
    # clear out all SVSHAPEs
    SVSHAPE0[0:31] <- [0] * 32
    SVSHAPE1[0:31] <- [0] * 32
    SVSHAPE2[0:31] <- [0] * 32
    SVSHAPE3[0:31] <- [0] * 32
    # set schedule up for multiply
    if (SVrm = 0b0000) then
        # VL in Matrix Multiply is xd*yd*zd
        # (dimensions are encoded minus one; zero-extend before adding 1
        # so that the all-ones encoding does not wrap)
        xd <- (0b00 || SVxd) + 1
        yd <- (0b00 || SVyd) + 1
        zd <- (0b00 || SVzd) + 1
        n <- xd * yd * zd
        vlen[0:6] <- n[14:20]
        # set up template in SVSHAPE0, then copy to 1-3
        SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
        SVSHAPE0[6:11] <- (0b0 || SVyd)   # ydim
        SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim
        SVSHAPE0[28:29] <- 0b11           # skip z
        # copy
        SVSHAPE1[0:31] <- SVSHAPE0[0:31]
        SVSHAPE2[0:31] <- SVSHAPE0[0:31]
        SVSHAPE3[0:31] <- SVSHAPE0[0:31]
        # set up FRA
        SVSHAPE1[18:20] <- 0b001          # permute x,z,y
        SVSHAPE1[28:29] <- 0b01           # skip z
        # FRC
        SVSHAPE2[18:20] <- 0b001          # permute x,z,y
        SVSHAPE2[28:29] <- 0b11           # skip y
        # NOTE(review): encoding 0b11 is labelled "skip z" for SVSHAPE0 and
        # "skip y" for SVSHAPE2 above - confirm which labels are intended.
    # set schedule up for FFT butterfly
    if (SVrm = 0b0001) then
        # calculate O(N log2 N)
        # (n counts the consecutive 1 bits of SVxd, starting at SVxd[4])
        n <- [0] * 3
        do while n < 5
           if SVxd[4-n] = 0 then
               leave
           n <- n + 1
        n <- ((0b0 || SVxd) + 1) * n
        vlen[0:6] <- n[1:7]
        # set up template in SVSHAPE0, then copy to 1-2
        # for FRA and FRT
        SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
        SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D FFT)
        mscale <- (0b0 || SVzd) + 1
        SVSHAPE0[30:31] <- 0b01          # Butterfly mode
        # copy
        SVSHAPE1[0:31] <- SVSHAPE0[0:31]
        SVSHAPE2[0:31] <- SVSHAPE0[0:31]
        # set up FRB and FRS
        SVSHAPE1[28:29] <- 0b01           # j+halfstep schedule
        # FRC (coefficients)
        SVSHAPE2[28:29] <- 0b10           # k schedule
    # set schedule up for (i)DCT Inner butterfly
    # SVrm Mode 4 (Mode 12 for iDCT) is for on-the-fly (Vertical-First Mode)
    if ((SVrm = 0b0100) |
        (SVrm = 0b1100)) then
        # calculate O(N log2 N)
        n <- [0] * 3
        do while n < 5
           if SVxd[4-n] = 0 then
               leave
           n <- n + 1
        n <- ((0b0 || SVxd) + 1) * n
        vlen[0:6] <- n[1:7]
        # set up template in SVSHAPE0, then copy to 1-2
        # set up FRB and FRS
        SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
        SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
        mscale <- (0b0 || SVzd) + 1
        if (SVrm = 0b1100) then
            SVSHAPE0[30:31] <- 0b11          # iDCT mode
            SVSHAPE0[18:20] <- 0b011         # iDCT Inner Butterfly sub-mode
        else
            SVSHAPE0[30:31] <- 0b01          # DCT mode
            SVSHAPE0[18:20] <- 0b001         # DCT Inner Butterfly sub-mode
            SVSHAPE0[21:23] <- 0b001         # "inverse" on outer loop
        SVSHAPE0[6:11] <- 0b000011       # (i)DCT Inner Butterfly mode 4
        # copy (SVSHAPE3 is not set up by these two modes: it stays cleared)
        SVSHAPE1[0:31] <- SVSHAPE0[0:31]
        SVSHAPE2[0:31] <- SVSHAPE0[0:31]
        # for FRA and FRT
        SVSHAPE0[28:29] <- 0b01           # j+halfstep schedule
        # for cos coefficient
        SVSHAPE2[28:29] <- 0b10           # ci (k for mode 4) schedule
        SVSHAPE2[12:17] <- 0b000000       # reset costable "striding" to 1
    # set schedule up for (i)DCT Outer butterfly
    if (SVrm = 0b0011) | (SVrm = 0b1011) then
        # calculate O(N log2 N) number of outer butterfly overlapping adds
        vlen[0:6] <- [0] * 7
        n <- 0b000
        size <- 0b0000001
        itercount[0:6] <- (0b00 || SVxd) + 0b0000001
        itercount[0:6] <- (0b0 || itercount[0:5])
        do while n < 5
           if SVxd[4-n] = 0 then
               leave
           n <- n + 1
           count <- (itercount - 0b0000001) * size
           vlen[0:6] <- vlen + count[7:13]
           size[0:6] <- (size[1:6] || 0b0)
           itercount[0:6] <- (0b0 || itercount[0:5])
        # set up template in SVSHAPE0, then copy to 1-2
        # set up FRB and FRS
        SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
        SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
        mscale <- (0b0 || SVzd) + 1
        if (SVrm = 0b1011) then
            SVSHAPE0[30:31] <- 0b11      # iDCT mode
            SVSHAPE0[18:20] <- 0b011     # iDCT Outer Butterfly sub-mode
            SVSHAPE0[21:23] <- 0b101     # "inverse" on outer and inner loop
        else
            SVSHAPE0[30:31] <- 0b01      # DCT mode
            SVSHAPE0[18:20] <- 0b100     # DCT Outer Butterfly sub-mode
        SVSHAPE0[6:11] <- 0b000010       # DCT Butterfly mode
        # copy
        SVSHAPE1[0:31] <- SVSHAPE0[0:31] # j+halfstep schedule
        SVSHAPE2[0:31] <- SVSHAPE0[0:31] # costable coefficients
        # for FRA and FRT
        SVSHAPE1[28:29] <- 0b01           # j+halfstep schedule
        # reset costable "striding" to 1
        SVSHAPE2[12:17] <- 0b000000
    # set schedule up for DCT COS table generation
    if (SVrm = 0b0101) | (SVrm = 0b1101) then
        # calculate O(N log2 N)
        vlen[0:6] <- [0] * 7
        itercount[0:6] <- (0b00 || SVxd) + 0b0000001
        itercount[0:6] <- (0b0 || itercount[0:5])
        n <- [0] * 3
        do while n < 5
           if SVxd[4-n] = 0 then
               leave
           n <- n + 1
           vlen[0:6] <- vlen + itercount
           itercount[0:6] <- (0b0 || itercount[0:5])
        # set up template in SVSHAPE0, then copy to 1-2
        # set up FRB and FRS
        SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
        SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
        mscale <- (0b0 || SVzd) + 1
        SVSHAPE0[30:31] <- 0b01          # DCT/FFT mode
        SVSHAPE0[6:11] <- 0b000100       # DCT Inner Butterfly COS-gen mode
        if (SVrm = 0b0101) then
            SVSHAPE0[21:23] <- 0b001     # "inverse" on outer loop for DCT
        # copy
        SVSHAPE1[0:31] <- SVSHAPE0[0:31]
        SVSHAPE2[0:31] <- SVSHAPE0[0:31]
        # for cos coefficient
        SVSHAPE1[28:29] <- 0b10           # ci schedule
        SVSHAPE2[28:29] <- 0b11           # size schedule
    # set schedule up for iDCT / DCT inverse of half-swapped ordering
    if (SVrm = 0b0110) | (SVrm = 0b1110) | (SVrm = 0b1111) then
        vlen[0:6] <- (0b00 || SVxd) + 0b0000001
        # set up template in SVSHAPE0
        SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
        SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
        mscale <- (0b0 || SVzd) + 1
        if (SVrm = 0b1110) then
            SVSHAPE0[18:20] <- 0b001     # DCT opposite half-swap
        if (SVrm = 0b1111) then
            SVSHAPE0[30:31] <- 0b01          # FFT mode
        else
            SVSHAPE0[30:31] <- 0b11          # DCT mode
        SVSHAPE0[6:11] <- 0b000101       # DCT "half-swap" mode
    # set schedule up for parallel reduction or prefix-sum
    if (SVrm = 0b0111) then
        # is scan/prefix-sum
        is_scan <- SVyd = 2
        # calculate the total number of operations (brute-force)
        vlen[0:6] <- [0] * 7
        itercount[0:6] <- (0b00 || SVxd) + 0b0000001
        if is_scan then
            # prefix sum algorithm with operations replaced with
            # incrementing vlen
            dist <- 1
            vlen[0:6] <- 0
            # first pass: dist doubles each time round
            do while dist <u itercount
                start <- dist * 2 - 1
                step <- dist * 2
                i <- start
                do while i <u itercount
                    vlen[0:6] <- vlen[0:6] + 1
                    i <- i + step
                dist <- dist * 2
            # second pass: dist halves each time round, down to zero
            dist <- dist / 2
            do while dist != 0
                i <- dist * 3 - 1
                do while i <u itercount
                    vlen[0:6] <- vlen[0:6] + 1
                    i <- i + dist * 2
                dist <- dist / 2
        else
            # i counts one operation per (j, j+step) pair, with step
            # doubling each time round the outer loop
            step <- 0b0000001
            i <- 0b0000000
            do while step <u itercount
                newstep <- step[1:6] || 0b0
                j[0:6] <- 0b0000000
                do while (j+step <u itercount)
                    j <- j + newstep
                    i <- i + 1
                step <- newstep
            # VL in Parallel-Reduce is the number of operations
            vlen[0:6] <- i
        # set up template in SVSHAPE0, then copy to 1. only 2 needed
        SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
        SVSHAPE0[12:17] <- (0b0 || SVzd)   # zdim - "striding" (2D DCT)
        mscale <- (0b0 || SVzd) + 1
        SVSHAPE0[30:31] <- 0b10          # parallel reduce/prefix submode
        # copy
        SVSHAPE1[0:31] <- SVSHAPE0[0:31]
        # set up submodes: parallel or prefix
        SVSHAPE0[28:29] <- 0b00   # left operand
        SVSHAPE1[28:29] <- 0b01   # right operand
        if is_scan then
            SVSHAPE0[28:29] <- 0b10   # left operand
            SVSHAPE1[28:29] <- 0b11   # right operand
    # set VL, MVL and Vertical-First
    # (MAXVL is VL scaled by mscale, which is 1 unless a mode set it to zdim+1)
    m[0:12] <- vlen * mscale
    maxvl[0:6] <- m[6:12]
    SVSTATE[0:6] <- maxvl  # MAXVL
    SVSTATE[7:13] <- vlen  # VL
    SVSTATE[63] <- vf

Special Registers Altered:

    SVSTATE
    SVSHAPE0-3

# svindex

SVI-Form

* svindex SVG,rmm,SVd,ew,SVyx,mm,sk

Pseudo-code:

    # Builds one "indexed" SVSHAPE template from the operands and installs
    # it into one or more of SVSHAPE0-3, updating the REMAP fields of
    # SVSTATE to point at the installed shape(s).
    #
    # based on nearest MAXVL compute other dimension
    MVL <- SVSTATE[0:6]
    d <- [0] * 6
    # zero-extend before adding 1 so that SVd=0b11111 gives dim=32 rather
    # than wrapping round to 0 (with dim=0 the loop below could never
    # terminate for a non-zero MVL)
    dim <- (0b0 || SVd) + 1
    # d becomes the smallest value for which d*dim is not below MVL.
    # MVL is padded to the width of the d*dim product.
    # NOTE(review): d is only 6 bits wide; if it can wrap before d*dim
    # reaches MVL (e.g. dim=1 with MVL above 63) the loop would not
    # terminate - confirm whether such combinations are permitted.
    do while d*dim <u ([0]*5 || MVL)
       d <- d + 1
    # set up template, then copy once location identified
    shape <- [0]*32
    shape[30:31] <- 0b00            # mode
    if SVyx = 0 then
        shape[18:20] <- 0b110       # indexed xd/yd
        shape[0:5] <- (0b0 || SVd)  # xdim
        if sk = 0 then shape[6:11] <- 0 # ydim
        else           shape[6:11] <- 0b111111 # ydim max
    else
        shape[18:20] <- 0b111       # indexed yd/xd
        if sk = 1 then shape[6:11] <- 0 # ydim
        else           shape[6:11] <- d-1 # ydim max
        shape[0:5] <- (0b0 || SVd) # ydim
    shape[12:17] <- (0b0 || SVG)        # SVGPR
    shape[28:29] <- ew                  # element-width override
    shape[21] <- sk                     # skip 1st dimension
    # select the mode for updating SVSHAPEs
    SVSTATE[62] <- mm # set or clear persistence
    if mm = 0 then
        # clear out all SVSHAPEs first
        SVSHAPE0[0:31] <- [0] * 32
        SVSHAPE1[0:31] <- [0] * 32
        SVSHAPE2[0:31] <- [0] * 32
        SVSHAPE3[0:31] <- [0] * 32
        SVSTATE[32:41] <- [0] * 10 # clear REMAP.mi/o
        SVSTATE[42:46] <- rmm # rmm exactly REMAP.SVme
        # each set bit of rmm (scanned from rmm[4] down to rmm[0]) gets the
        # next SVSHAPE in sequence, and its 2-bit selector is pointed at it
        idx <- 0
        for bit = 0 to 4
            if rmm[4-bit] then
                # activate requested shape
                if idx = 0 then SVSHAPE0 <- shape
                if idx = 1 then SVSHAPE1 <- shape
                if idx = 2 then SVSHAPE2 <- shape
                if idx = 3 then SVSHAPE3 <- shape
                SVSTATE[bit*2+32:bit*2+33] <- idx
                # increment shape index, modulo 4
                if idx = 3 then idx <- 0
                else            idx <- idx + 1
    else
        # refined SVSHAPE/REMAP update mode
        # rmm[0:2] picks which selector/enable to update,
        # rmm[3:4] picks which SVSHAPE receives the template
        bit <- rmm[0:2]
        idx <- rmm[3:4]
        if idx = 0 then SVSHAPE0 <- shape
        if idx = 1 then SVSHAPE1 <- shape
        if idx = 2 then SVSHAPE2 <- shape
        if idx = 3 then SVSHAPE3 <- shape
        SVSTATE[bit*2+32:bit*2+33] <- idx
        SVSTATE[46-bit] <- 1

Special Registers Altered:

    SVSTATE
    SVSHAPE0-3

# svshape2

SVM2-Form

* svshape2 SVo,SVyx,rmm,SVd,sk,mm

Pseudo-code:

    # Builds one SVSHAPE template carrying an offset (SVo) from the operands
    # and installs it into one or more of SVSHAPE0-3, updating the REMAP
    # fields of SVSTATE to point at the installed shape(s).
    #
    # based on nearest MAXVL compute other dimension
    MVL <- SVSTATE[0:6]
    d <- [0] * 6
    # zero-extend before adding 1 so that SVd=0b11111 gives dim=32 rather
    # than wrapping round to 0 (with dim=0 the loop below could never
    # terminate for a non-zero MVL)
    dim <- (0b0 || SVd) + 1
    # d becomes the smallest value for which d*dim is not below MVL.
    # MVL is padded to the width of the d*dim product.
    # NOTE(review): d is only 6 bits wide; if it can wrap before d*dim
    # reaches MVL (e.g. dim=1 with MVL above 63) the loop would not
    # terminate - confirm whether such combinations are permitted.
    do while d*dim <u ([0]*5 || MVL)
       d <- d + 1
    # set up template, then copy once location identified
    shape <- [0]*32
    shape[30:31] <- 0b00            # mode
    shape[0:5] <- (0b0 || SVd)      # x/ydim
    if SVyx = 0 then
        shape[18:20] <- 0b000       # ordering xd/yd(/zd)
        if sk = 0 then shape[6:11] <- 0 # ydim
        else           shape[6:11] <- 0b111111 # ydim max
    else
        shape[18:20] <- 0b010       # ordering yd/xd(/zd)
        if sk = 1 then shape[6:11] <- 0 # ydim
        else           shape[6:11] <- d-1 # ydim max
    # offset (the prime purpose of this instruction)
    shape[24:27] <- SVo         # offset
    if sk = 1 then shape[28:29] <- 0b01 # skip 1st dimension
    else           shape[28:29] <- 0b00 # no skipping
    # select the mode for updating SVSHAPEs
    SVSTATE[62] <- mm # set or clear persistence
    if mm = 0 then
        # clear out all SVSHAPEs first
        SVSHAPE0[0:31] <- [0] * 32
        SVSHAPE1[0:31] <- [0] * 32
        SVSHAPE2[0:31] <- [0] * 32
        SVSHAPE3[0:31] <- [0] * 32
        SVSTATE[32:41] <- [0] * 10 # clear REMAP.mi/o
        SVSTATE[42:46] <- rmm # rmm exactly REMAP.SVme
        # each set bit of rmm (scanned from rmm[4] down to rmm[0]) gets the
        # next SVSHAPE in sequence, and its 2-bit selector is pointed at it
        idx <- 0
        for bit = 0 to 4
            if rmm[4-bit] then
                # activate requested shape
                if idx = 0 then SVSHAPE0 <- shape
                if idx = 1 then SVSHAPE1 <- shape
                if idx = 2 then SVSHAPE2 <- shape
                if idx = 3 then SVSHAPE3 <- shape
                SVSTATE[bit*2+32:bit*2+33] <- idx
                # increment shape index, modulo 4
                if idx = 3 then idx <- 0
                else            idx <- idx + 1
    else
        # refined SVSHAPE/REMAP update mode
        # rmm[0:2] picks which selector/enable to update,
        # rmm[3:4] picks which SVSHAPE receives the template
        bit <- rmm[0:2]
        idx <- rmm[3:4]
        if idx = 0 then SVSHAPE0 <- shape
        if idx = 1 then SVSHAPE1 <- shape
        if idx = 2 then SVSHAPE2 <- shape
        if idx = 3 then SVSHAPE3 <- shape
        SVSTATE[bit*2+32:bit*2+33] <- idx
        SVSTATE[46-bit] <- 1

Special Registers Altered:

    SVSTATE
    SVSHAPE0-3