From 5089b6098970f3ca8b7aaac244a0b0cbc778d077 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Thu, 27 Apr 2023 09:28:17 +0000
Subject: [PATCH] VF intro, more formatting fixes

---
 openpower/sv/cookbook/chacha20.mdwn | 191 +++++++++++++++-------------
 1 file changed, 104 insertions(+), 87 deletions(-)

diff --git a/openpower/sv/cookbook/chacha20.mdwn b/openpower/sv/cookbook/chacha20.mdwn
index 98c8cfe4f..fec2b730f 100644
--- a/openpower/sv/cookbook/chacha20.mdwn
+++ b/openpower/sv/cookbook/chacha20.mdwn
@@ -2,28 +2,34 @@
 # XChaCha20 SVP64 Implementation Analysis

-## First, introduction to Vertical-First Mode
-
 ## Description of XChaCha20 Algorithm

+We will first try to analyze the XChaCha20 algorithm as found in:
+
+https://github.com/spcnvdr/xchacha20/blob/master/src/xchacha20.c
+
+The function under inspection is `xchacha_hchacha20`. If we look at it
+closely, we will notice that the main part of the computation, the main
+algorithm, is just a for loop
-which is also the same in the `xchacha_encrypt_bytes` function as well.
+

 Main loop for `xchacha_hchacha20`:

-    for (i = 0; i < 10; i++){
-        QUARTERROUND(x0, x4, x8, x12);
-        QUARTERROUND(x1, x5, x9, x13);
-        QUARTERROUND(x2, x6, x10, x14);
-        QUARTERROUND(x3, x7, x11, x15);
-        QUARTERROUND(x0, x5, x10, x15);
-        QUARTERROUND(x1, x6, x11, x12);
-        QUARTERROUND(x2, x7, x8, x13);
-        QUARTERROUND(x3, x4, x9, x14);
-    }
-
-    #define QUARTERROUND(a,b,c,d) \
-        a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
-        c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
-        a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
-        c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
+```
+for (i = 0; i < 10; i++){
+    QUARTERROUND(x0, x4, x8, x12);
+    QUARTERROUND(x1, x5, x9, x13);
+    QUARTERROUND(x2, x6, x10, x14);
+    QUARTERROUND(x3, x7, x11, x15);
+    QUARTERROUND(x0, x5, x10, x15);
+    QUARTERROUND(x1, x6, x11, x12);
+    QUARTERROUND(x2, x7, x8, x13);
+    QUARTERROUND(x3, x4, x9, x14);
+}
+
+#define QUARTERROUND(a,b,c,d) \
+    a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
+    c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
+    a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
+    c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
+```

 We see that the loop is split in two groups of `QUARTERROUND` calls, one
 with `step=4`:

     QUARTERROUND(x0, x4, x8, x12);
     QUARTERROUND(x1, x5, x9, x13);
     QUARTERROUND(x2, x6, x10, x14);
     QUARTERROUND(x3, x7, x11, x15);

 and another with `step=5`:

     QUARTERROUND(x0, x5, x10, x15);
@@ -39,7 +45,7 @@ and another with `step=5`:
     QUARTERROUND(x1, x6, x11, x12);
     QUARTERROUND(x2, x7, x8, x13);
     QUARTERROUND(x3, x4, x9, x14);
-
+
 Let's start with the first group of `QUARTERROUND`s. By unrolling it, we
 essentially get the following instructions:

@@ -80,7 +86,7 @@ Second group of `QUARTERROUND`s, unrolled:
     x0 = x0 + x5;   x15 = ROTATE(x15 ^ x0, 16);
     x9 = x9 + x14;   x4 = ROTATE(x4 ^ x9, 7);

 Let's list the additions only:
-
+
     x0 = x0 + x4
     x8 = x8 + x12
     x0 = x0 + x4
     x8 = x8 + x12
     x1 = x1 + x5
     x9 = x9 + x13
     x1 = x1 + x5
     x9 = x9 + x13
     x2 = x2 + x6
     x10 = x10 + x14
     x2 = x2 + x6
     x10 = x10 + x14
     x3 = x3 + x7
     x11 = x11 + x15
     x3 = x3 + x7
     x11 = x11 + x15
     x0 = x0 + x5
     x10 = x10 + x15
     x0 = x0 + x5
     x10 = x10 + x15
     x1 = x1 + x6
     x11 = x11 + x12
     x1 = x1 + x6
     x11 = x11 + x12
     x2 = x2 + x7
     x8 = x8 + x13
     x2 = x2 + x7
     x8 = x8 + x13
     x3 = x3 + x4
     x9 = x9 + x14
     x3 = x3 + x4
     x9 = x9 + x14

-Since we're going to use Vertical-First mode (instructions are executed
-first, and an explicit "svstep" moves on to the next register), the
-additions will be executed one by one and we need to note the indices
-that are going to be used for each operation. We remind that sv.add is
-the instruction that will be executed, in the form:
+## Introduction to Vertical-First Mode
+
+We're going to use Vertical-First mode (VF) to implement this, so we
+will first give a short introduction to VF. In normal Horizontal Vector
+mode, or even in traditional SIMD mode, the instruction is executed
+across a (Horizontal) Vector. So, if we have, for example `VL=8`, then
+the instruction
+
+    sv.add *RT, *RA, *RB   # RT = RA + RB
-    sv.add RT, RA, RB      # RT = RA + RB
+
+will be executed on all elements of the vector, **before** moving to the
+next assembly instruction.
+
+This behaviour changes in Vertical-First mode. In VF mode, the
+instruction is executed on a **single** element of the vector before
+moving to the next assembly instruction. It continues to do so until
+execution reaches the `svstep` instruction, at which point processing
+moves on to the next element/register in the vector. Branching back to
+the beginning of the loop does not happen automatically, though; a
+branch instruction has to be added manually.
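To make the difference in execution order concrete, here is a minimal C
sketch of the two schedules (an editorial illustration, not part of the
patch: the loops are plain scalar C standing in for the vector
instructions, and the follow-on XOR instruction is made up purely to show
the ordering, assuming `VL=8` as above):

```c
#include <stdio.h>

#define VL 8   /* assume VL=8 as in the example above */

int main(void) {
    int ra[VL], rb[VL], rt[VL], rs[VL];
    for (int i = 0; i < VL; i++) { ra[i] = i; rb[i] = 10 * i; }

    /* Horizontal mode: each instruction runs across the whole vector
     * before the next assembly instruction starts. */
    for (int i = 0; i < VL; i++)        /* sv.add *RT, *RA, *RB */
        rt[i] = ra[i] + rb[i];
    for (int i = 0; i < VL; i++)        /* next instruction, e.g. an XOR */
        rs[i] = rt[i] ^ ra[i];

    /* Vertical-First mode: one element of each instruction in turn,
     * then an explicit svstep advances to the next element, and an
     * explicit branch returns to the top of the loop. */
    for (int i = 0; i < VL; i++) {      /* i advances only at svstep. */
        rt[i] = ra[i] + rb[i];          /* sv.add, element i only */
        rs[i] = rt[i] ^ ra[i];          /* sv.xor, element i only */
    }                                   /* svstep. ; bc back to loop start */

    for (int i = 0; i < VL; i++)
        printf("rt[%d]=%d rs[%d]=%d\n", i, rt[i], i, rs[i]);
    return 0;
}
```

Both schedules produce the same values; only the order in which the
element operations are issued changes.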
-Let's assume the values x in the registers 24-36
+## Application of VF mode in the XChaCha20 loop

-  GPR 24 | x0  | x1  | x2  | x3  |
-  GPR 28 | x4  | x5  | x6  | x7  |
-  GPR 32 | x8  | x9  | x10 | x11 |
-  GPR 36 | x12 | x13 | x14 | x15 |
+Let's assume the values `x` are stored in registers 24-36:
+
+| GPR 24 | x0  | x1  | x2  | x3  |
+| GPR 28 | x4  | x5  | x6  | x7  |
+| GPR 32 | x8  | x9  | x10 | x11 |
+| GPR 36 | x12 | x13 | x14 | x15 |
+
-So for the addition in Vertical-First mode, RT (and RA as they are the
+So for the addition in Vertical-First mode, `RT` (and `RA`, as they are the
 same) indices are (in terms of x):

-    | 0 | 8  | 0 | 8  | 1 | 9  | 1 | 9  |
-    | 2 | 10 | 2 | 10 | 3 | 11 | 3 | 11 |
-    | 0 | 10 | 0 | 10 | 1 | 11 | 1 | 11 |
-    | 2 | 8  | 2 | 8  | 3 | 9  | 3 | 9  |
+| 0 | 8  | 0 | 8  | 1 | 9  | 1 | 9  |
+| 2 | 10 | 2 | 10 | 3 | 11 | 3 | 11 |
+| 0 | 10 | 0 | 10 | 1 | 11 | 1 | 11 |
+| 2 | 8  | 2 | 8  | 3 | 9  | 3 | 9  |

 However, since the indices are small values, using a single 64-bit
 register for a single index value is a waste, so we will compress them:
 8 indices in a 64-bit register.

-So, RT indices will fit inside these 4 registers (in Little Endian format):
+So, the `RT` indices will fit inside these 4 registers (in Little Endian format):

     SVSHAPE0: | 0x901090108000800 | 0xb030b030a020a02 | 0xb010b010a000a00 | 0x903090308020802 |

 Similarly, we find the RB indices:

-    | 4 | 12 | 4 | 12 | 5 | 13 | 5 | 13 |
-    | 6 | 14 | 6 | 14 | 7 | 15 | 7 | 15 |
-    | 5 | 15 | 5 | 15 | 6 | 12 | 6 | 12 |
-    | 7 | 13 | 7 | 13 | 4 | 14 | 4 | 14 |
+| 4 | 12 | 4 | 12 | 5 | 13 | 5 | 13 |
+| 6 | 14 | 6 | 14 | 7 | 15 | 7 | 15 |
+| 5 | 15 | 5 | 15 | 6 | 12 | 6 | 12 |
+| 7 | 13 | 7 | 13 | 4 | 14 | 4 | 14 |

-Using a similar method, we find the final 4 registers with the RB indices:
+Using a similar method, we find the final 4 registers with the `RB` indices:

     SVSHAPE1: | 0xd050d050c040c04 | 0xf070f070e060e06 | 0xc060c060f050f05 | 0xe040e040d070d07 |

@@ -170,7 +184,7 @@ The first instruction

     svindex 4, 0, 1, 3, 0, 1, 0

-loads the add RT indices in the `SVSHAPE0`, in register 8. You will note
+loads the add `RT` indices into `SVSHAPE0`, starting at register 8. You will note
 that 4 is listed, but that's because it only works on even registers, so
 in order to save a bit, we have to double that number to get the actual
 register. So, `SVSHAPE0` will be listed in GPRs 8-11. The number
@@ -181,7 +195,7 @@ The next step instruction

     svindex 6, 1, 1, 3, 0, 1, 0

-loads the add RB indices into `SVSHAPE1`. Again, even though we list 6,
+loads the add `RB` indices into `SVSHAPE1`. Again, even though we list 6,
 the indices will actually be loaded starting at GPR #12; again, the use
 of 8-bit elements is denoted.
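As a quick sanity check of the compressed index constants above, the
following short C program (an editorial aside, not part of the patch)
packs the first row of the `RT` table and the first row of the `RB`
table as 8-bit little-endian elements and prints the resulting 64-bit
values, which should come out as the first words of `SVSHAPE0` and
`SVSHAPE1` respectively:

```c
#include <stdint.h>
#include <stdio.h>

/* Pack eight 8-bit indices into one 64-bit register image,
 * element 0 in the least significant byte (Little Endian order). */
static uint64_t pack8(const uint8_t idx[8]) {
    uint64_t v = 0;
    for (int i = 0; i < 8; i++)
        v |= (uint64_t)idx[i] << (8 * i);
    return v;
}

int main(void) {
    /* First rows of the RT and RB index tables above. */
    const uint8_t rt_row0[8] = { 0, 8, 0, 8, 1, 9, 1, 9 };
    const uint8_t rb_row0[8] = { 4, 12, 4, 12, 5, 13, 5, 13 };

    /* Expected: 0x901090108000800 and 0xd050d050c040c04 */
    printf("SVSHAPE0[0] = 0x%llx\n", (unsigned long long)pack8(rt_row0));
    printf("SVSHAPE1[0] = 0x%llx\n", (unsigned long long)pack8(rb_row0));
    return 0;
}
```

The same packing, applied to the remaining rows, reproduces the other
three words of each shape.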
 Next, the `setvl` instructions:
@@ -194,8 +208,8 @@ to use the indices, and we do this using `svremap`.

     svremap 31, 1, 0, 0, 0, 0, 0

-`svremap` basically instructs the scheduler to use `SVSHAPE0` for RT and RB,
-`SVSHAPE1` for RA. The next instruction performs the **actual** addition:
+`svremap` basically instructs the scheduler to use `SVSHAPE0` for `RT` and `RB`,
+`SVSHAPE1` for `RA`. The next instruction performs the **actual** addition:

     sv.add/w=32 *x, *x, *x

@@ -220,7 +234,7 @@ issue the following sequence of instructions:
     add/w=32 24 + 0, 24 + 4, 24 + 0
     add/w=32 24 + 8, 24 + 12, 24 + 8
     add/w=32 24 + 0, 24 + 4, 24 + 0
     add/w=32 24 + 8, 24 + 12, 24 + 8
     add/w=32 24 + 1, 24 + 5, 24 + 1
     add/w=32 24 + 9, 24 + 13, 24 + 9
     ...

-Finally, the svstep. instruction steps to the next set of indices
+Finally, the `svstep.` instruction steps to the next set of indices.

 We have shown how to do the additions in Vertical-First mode. Now
 let's add the rest of the instructions in the `QUARTERROUND`s. For the
@@ -261,13 +275,13 @@ XOR(d, a)`:
     x4 = x4 ^ x9

 We will need to create another set of indices for the `XOR` instructions. We
-will only need one set as the other set of indices is the same as RT
-for `sv.add (SHAPE0)`. So, remembering that our
+will only need one set, as the other set of indices is the same as the `RT`
+indices for `sv.add` (`SHAPE0`). So, remembering that our

-    | 12 | 4 | 12 | 4 | 13 | 5 | 13 | 5 |
-    | 14 | 6 | 14 | 6 | 15 | 7 | 15 | 7 |
-    | 15 | 5 | 15 | 5 | 12 | 6 | 12 | 6 |
-    | 13 | 7 | 13 | 7 | 14 | 4 | 14 | 4 |
+| 12 | 4 | 12 | 4 | 13 | 5 | 13 | 5 |
+| 14 | 6 | 14 | 6 | 15 | 7 | 15 | 7 |
+| 15 | 5 | 15 | 5 | 12 | 6 | 12 | 6 |
+| 13 | 7 | 13 | 7 | 14 | 4 | 14 | 4 |

 Again, we find

@@ -304,38 +318,41 @@ indices for `SVSHAPE3` will have to be in 32-bit elements:

     SHIFTS: | 0x0000000c00000010 | 0x0000000700000008 |
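Before listing the complete loop, it may help to see what one
Vertical-First step of the remapped `sv.add`/`sv.xor`/`sv.rldcl` triple
amounts to at the element level. The following is an illustrative C
sketch (editorial, not part of the patch): `a`, `b` and `d` stand for the
per-step indices delivered by `SVSHAPE0`, `SVSHAPE1` and `SVSHAPE2`, and
the shift amount cycles through the four SHIFTS values. Running the
first four steps reproduces `QUARTERROUND(x0, x4, x8, x12)`:

```c
#include <stdint.h>
#include <stdio.h>

/* 32-bit left rotation, same as the ROTATE macro in xchacha20.c */
static inline uint32_t rotl32(uint32_t v, unsigned s) {
    return (v << s) | (v >> (32 - s));
}

/* One Vertical-First step: the element-level effect of the remapped
 * sv.add/w=32, sv.xor/w=32 and sv.rldcl/w=32 instructions.
 * x[] is the state seen as 16 32-bit elements (x0..x15). */
static void vf_step(uint32_t x[16], int a, int b, int d, unsigned shift) {
    x[a] = x[a] + x[b];           /* sv.add/w=32  : dest/src via SVSHAPE0, src via SVSHAPE1 */
    x[d] = x[d] ^ x[a];           /* sv.xor/w=32  : dest/src via SVSHAPE2, src via SVSHAPE0 */
    x[d] = rotl32(x[d], shift);   /* sv.rldcl/w=32: shift amount from SVSHAPE3 (mod 4)      */
}

int main(void) {
    /* First four steps of the tables above, with shifts 16, 12, 8, 7:
     * exactly QUARTERROUND(x0, x4, x8, x12). */
    const int      a[4]  = { 0, 8, 0, 8 };
    const int      b[4]  = { 4, 12, 4, 12 };
    const int      d[4]  = { 12, 4, 12, 4 };
    const unsigned sh[4] = { 16, 12, 8, 7 };

    uint32_t x[16];
    for (int i = 0; i < 16; i++)
        x[i] = 0x01010101u * (uint32_t)(i + 1);   /* arbitrary test state */

    for (int i = 0; i < 4; i++)
        vf_step(x, a[i], b[i], d[i], sh[i]);

    for (int i = 0; i < 16; i++)
        printf("x%-2d = %08x\n", i, x[i]);
    return 0;
}
```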
 The complete algorithm for a loop with 10 iterations is as follows:

+```
+    li 7, 10      # Load value 10 into GPR #7
+    mtctr 7       # Set up counter on GPR #7

-        li        7, 10               # Load value 10 into GPR #7
-        mtctr     7                   # Set up counter on GPR #7
-
-        # set up VL=32 vertical-first, and SVSHAPEs 0-2
-        setvl 0, 0, 32, 1, 1, 1
-        # SHAPE0, used by sv.add starts at GPR #8
-        svindex 8/2, 0, 1, 3, 0, 1, 0  # SVSHAPE0, a
-        # SHAPE1, used by sv.xor starts at GPR #12
-        svindex 12/2, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b
-        # SHAPE2, used by sv.rldcl starts at GPR #16
-        svindex 16/2, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c
-        # SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20
-        # The inner loop will do 32 iterations, but there are only
-        # 4 shift values, so we mod by 4, and can cycle through them
-        svshape2 0, 0, 3, 4, 0, 1      # SVSHAPE3, shift amount, mod4
-
-    .outer:
-        # outer loop begins here (standard CTR loop)
-        setvl 0, 0, 32, 1, 1, 1        # MAXVL=VL=32, VF=1
-        # inner loop begins here. add-xor-rotl32 with remap, step, branch
-    .inner:
-        svremap 31, 1, 0, 0, 0, 0, 0   # RA=1, RB=0, RT=0 (0b01011)
-        sv.add/w=32 *x, *x, *x
-        svremap 31, 2, 0, 2, 2, 0, 0   # RA=2, RB=0, RS=2 (0b00111)
-        sv.xor/w=32 *x, *x, *x
-        svremap 31, 0, 3, 2, 2, 0, 0   # RA=2, RB=3, RS=2 (0b01110)
-        sv.rldcl/w=32 *x, *x, *SHIFTS, 0
-        # 16 is the destination containing the result of svstep.
-        # it overlaps with SHAPE2 which is also 16. the first 8 indices
-        # will get corrupted.
-        svstep. 7, 1, 0               # step to next in-regs element
-        bc 6, 3, .inner               # svstep. Rc=1 loop-end-condition?
-        # inner-loop done: outer loop standard CTR-decrement to setvl again
-        bdnz .outer                   # Loop until CTR is zero
+    # set up VL=32 vertical-first, and SVSHAPEs 0-2
+    setvl 0, 0, 32, 1, 1, 1
+    # SHAPE0, used by sv.add starts at GPR #8
+    svindex 8/2, 0, 1, 3, 0, 1, 0    # SVSHAPE0, a
+    # SHAPE1, used by sv.xor starts at GPR #12
+    svindex 12/2, 1, 1, 3, 0, 1, 0   # SVSHAPE1, b
+    # SHAPE2, used by sv.rldcl starts at GPR #16
+    svindex 16/2, 2, 1, 3, 0, 1, 0   # SVSHAPE2, c
+    # SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20
+    # The inner loop will do 32 iterations, but there are only
+    # 4 shift values, so we mod by 4, and can cycle through them
+    svshape2 0, 0, 3, 4, 0, 1        # SVSHAPE3, shift amount, mod4
+.outer:
+    # outer loop begins here (standard CTR loop)
+    setvl 0, 0, 32, 1, 1, 1          # MAXVL=VL=32, VF=1
+    # inner loop begins here. add-xor-rotl32 with remap, step, branch
+.inner:
+    svremap 31, 1, 0, 0, 0, 0, 0     # RA=1, RB=0, RT=0 (0b01011)
+    sv.add/w=32 *x, *x, *x
+    svremap 31, 2, 0, 2, 2, 0, 0     # RA=2, RB=0, RS=2 (0b00111)
+    sv.xor/w=32 *x, *x, *x
+    svremap 31, 0, 3, 2, 2, 0, 0     # RA=2, RB=3, RS=2 (0b01110)
+    sv.rldcl/w=32 *x, *x, *SHIFTS, 0
+    # 16 is the destination containing the result of svstep.
+    # it overlaps with SHAPE2 which is also 16. the first 8 indices
+    # will get corrupted.
+    svstep. 7, 1, 0                  # step to next in-regs element
+    bc 6, 3, .inner                  # svstep. Rc=1 loop-end-condition?
+    # inner-loop done: outer loop standard CTR-decrement to setvl again
+    bdnz .outer                      # Loop until CTR is zero
+```

-- 
2.30.2
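Appended note (editorial, not part of the patch): as a way of checking
that the index tables and shift constants above really do reproduce the
original algorithm, here is a hedged, self-contained C model. It performs
10 outer iterations, each consisting of 32 Vertical-First style
add/xor/rotate steps driven by the `SVSHAPE0`/`SVSHAPE1`/`SVSHAPE2` index
sequences and the mod-4 SHIFTS cycle, and compares the result against the
plain `QUARTERROUND` double-round from the top of the document:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static inline uint32_t rotl32(uint32_t v, unsigned s) {
    return (v << s) | (v >> (32 - s));
}

/* Reference: QUARTERROUND exactly as in xchacha20.c */
static void quarterround(uint32_t x[16], int a, int b, int c, int d) {
    x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
    x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
    x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a],  8);
    x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c],  7);
}

/* The 32-entry index sequences from the tables above:
 * A = SVSHAPE0, B = SVSHAPE1, D = SVSHAPE2, SH = SHIFTS (cycled mod 4). */
static const uint8_t A[32] = { 0,8,0,8, 1,9,1,9, 2,10,2,10, 3,11,3,11,
                               0,10,0,10, 1,11,1,11, 2,8,2,8, 3,9,3,9 };
static const uint8_t B[32] = { 4,12,4,12, 5,13,5,13, 6,14,6,14, 7,15,7,15,
                               5,15,5,15, 6,12,6,12, 7,13,7,13, 4,14,4,14 };
static const uint8_t D[32] = { 12,4,12,4, 13,5,13,5, 14,6,14,6, 15,7,15,7,
                               15,5,15,5, 12,6,12,6, 13,7,13,7, 14,4,14,4 };
static const unsigned SH[4] = { 16, 12, 8, 7 };

int main(void) {
    uint32_t ref[16], vf[16];
    for (int i = 0; i < 16; i++)
        ref[i] = vf[i] = 0x9e3779b9u * (uint32_t)(i + 1);  /* arbitrary state */

    for (int round = 0; round < 10; round++) {
        /* Reference double round, as in the original for loop. */
        quarterround(ref, 0, 4,  8, 12);
        quarterround(ref, 1, 5,  9, 13);
        quarterround(ref, 2, 6, 10, 14);
        quarterround(ref, 3, 7, 11, 15);
        quarterround(ref, 0, 5, 10, 15);
        quarterround(ref, 1, 6, 11, 12);
        quarterround(ref, 2, 7,  8, 13);
        quarterround(ref, 3, 4,  9, 14);

        /* Vertical-First model: 32 steps of add/xor/rotl, one element each. */
        for (int i = 0; i < 32; i++) {
            vf[A[i]] += vf[B[i]];
            vf[D[i]] ^= vf[A[i]];
            vf[D[i]]  = rotl32(vf[D[i]], SH[i % 4]);
        }
    }

    printf("%s\n", memcmp(ref, vf, sizeof ref) == 0 ? "MATCH" : "MISMATCH");
    return 0;
}
```

If the tables were transcribed correctly, the program prints `MATCH`.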