From 14a3722e345878d5b41d592c87e526ee1b622ac5 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Wed, 26 Apr 2023 14:31:49 +0000
Subject: [PATCH] WIP: initial commit of chacha20 SVP64 page

---
 openpower/sv/cookbook/chacha20.mdwn | 338 ++++++++++++++++++++++++++++
 1 file changed, 338 insertions(+)
 create mode 100644 openpower/sv/cookbook/chacha20.mdwn

diff --git a/openpower/sv/cookbook/chacha20.mdwn b/openpower/sv/cookbook/chacha20.mdwn
new file mode 100644
index 000000000..74c7d9627
--- /dev/null
+++ b/openpower/sv/cookbook/chacha20.mdwn
@@ -0,0 +1,338 @@
+[[!tag svp64_cookbook]]
+
+# ChaCha20 SVP64 Implementation Analysis
+
+Main loop for xchacha_hchacha20:
+
+    for (i = 0; i < 10; i++){
+        QUARTERROUND(x0, x4, x8, x12);
+        QUARTERROUND(x1, x5, x9, x13);
+        QUARTERROUND(x2, x6, x10, x14);
+        QUARTERROUND(x3, x7, x11, x15);
+        QUARTERROUND(x0, x5, x10, x15);
+        QUARTERROUND(x1, x6, x11, x12);
+        QUARTERROUND(x2, x7, x8, x13);
+        QUARTERROUND(x3, x4, x9, x14);
+    }
+
+    #define QUARTERROUND(a,b,c,d) \
+        a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
+        c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
+        a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
+        c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
+
+We see that the loop is split into two groups of QUARTERROUND calls,
+one with step=4:
+
+    QUARTERROUND(x0, x4, x8, x12);
+    QUARTERROUND(x1, x5, x9, x13);
+    QUARTERROUND(x2, x6, x10, x14);
+    QUARTERROUND(x3, x7, x11, x15);
+
+and another with step=5:
+
+    QUARTERROUND(x0, x5, x10, x15);
+    QUARTERROUND(x1, x6, x11, x12);
+    QUARTERROUND(x2, x7, x8, x13);
+    QUARTERROUND(x3, x4, x9, x14);
+
+Let's start with the first group of QUARTERROUNDs. Unrolling it
+essentially results in the following instructions:
+
+    x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 16);
+    x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 12);
+    x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 8);
+    x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 7);
+    x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 16);
+    x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 12);
+    x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 8);
+    x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 7);
+    x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 16);
+    x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 12);
+    x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 8);
+    x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 7);
+    x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 16);
+    x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 12);
+    x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 8);
+    x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 7);
+
+The second group of QUARTERROUNDs, unrolled:
+
+    x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 16);
+    x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 12);
+    x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 8);
+    x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 7);
+    x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 16);
+    x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 12);
+    x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 8);
+    x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 7);
+    x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 16);
+    x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 12);
+    x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 8);
+    x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 7);
+    x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 16);
+    x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 12);
+    x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 8);
+    x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 7);
+
+Let's list the additions only:
+
+    x0 = x0 + x4
+    x8 = x8 + x12
+    x0 = x0 + x4
+    x8 = x8 + x12
+    x1 = x1 + x5
+    x9 = x9 + x13
+    x1 = x1 + x5
+    x9 = x9 + x13
+    x2 = x2 + x6
+    x10 = x10 + x14
+    x2 = x2 + x6
+    x10 = x10 + x14
+    x3 = x3 + x7
+    x11 = x11 + x15
+    x3 = x3 + x7
+    x11 = x11 + x15
+    x0 = x0 + x5
+    x10 = x10 + x15
+    x0 = x0 + x5
+    x10 = x10 + x15
+    x1 = x1 + x6
+    x11 = x11 + x12
+    x1 = x1 + x6
+    x11 = x11 + x12
+    x2 = x2 + x7
+    x8 = x8 + x13
+    x2 = x2 + x7
+    x8 = x8 + x13
+    x3 = x3 + x4
+    x9 = x9 + x14
+    x3 = x3 + x4
+    x9 = x9 + x14
+
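+As a cross-check (not part of the original page), these per-operation
+element indices can be derived mechanically from the QUARTERROUND call
+pattern. A minimal C helper, using the x0-x15 numbering above:
+
+    #include <stdio.h>
+
+    /* the two groups of QUARTERROUND(a,b,c,d) calls, in program order */
+    static const int qr[8][4] = {
+        {0, 4,  8, 12}, {1, 5,  9, 13}, {2, 6, 10, 14}, {3, 7, 11, 15},
+        {0, 5, 10, 15}, {1, 6, 11, 12}, {2, 7,  8, 13}, {3, 4,  9, 14},
+    };
+
+    int main(void) {
+        /* each QUARTERROUND contains four adds: a+=b, c+=d, a+=b, c+=d */
+        printf("RT/RA indices:");
+        for (int i = 0; i < 8; i++)
+            printf(" %d %d %d %d", qr[i][0], qr[i][2], qr[i][0], qr[i][2]);
+        printf("\nRB indices:   ");
+        for (int i = 0; i < 8; i++)
+            printf(" %d %d %d %d", qr[i][1], qr[i][3], qr[i][1], qr[i][3]);
+        printf("\n");
+        return 0;
+    }
+
+Its output lists, in order, the a (RT/RA) and b (RB) element indices
+that are used below.
+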
+Since we're going to use Vertical-First mode (instructions are executed
+first, and an explicit "svstep" moves on to the next element), the
+additions will be executed one by one and we need to note the indices
+that are going to be used for each operation. Recall that sv.add is
+the instruction that will be executed, in the form:
+
+    sv.add RT, RA, RB # RT = RA + RB
+
+Let's assume the x values are loaded in GPRs starting at 24:
+
+    GPR 24 | x0  | x1  | x2  | x3  |
+    GPR 28 | x4  | x5  | x6  | x7  |
+    GPR 32 | x8  | x9  | x10 | x11 |
+    GPR 36 | x12 | x13 | x14 | x15 |
+
+So for the addition in Vertical-First mode, the RT (and RA, as they are
+the same) indices are (in terms of x):
+
+    | 0 | 8 | 0 | 8 | 1 | 9 | 1 | 9 |
+    | 2 | 10 | 2 | 10 | 3 | 11 | 3 | 11 |
+    | 0 | 10 | 0 | 10 | 1 | 11 | 1 | 11 |
+    | 2 | 8 | 2 | 8 | 3 | 9 | 3 | 9 |
+
+However, since the indices are small values, using a whole 64-bit
+register for a single index value would be a waste, so we compress
+them, 8 indices to a 64-bit register. The RT indices therefore fit
+inside these 4 registers (in Little-Endian format):
+
+    SVSHAPE0: | 0x901090108000800 | 0xb030b030a020a02 | 0xb010b010a000a00 | 0x903090308020802 |
+
+Similarly we find the RB indices:
+
+    | 4 | 12 | 4 | 12 | 5 | 13 | 5 | 13 |
+    | 6 | 14 | 6 | 14 | 7 | 15 | 7 | 15 |
+    | 5 | 15 | 5 | 15 | 6 | 12 | 6 | 12 |
+    | 7 | 13 | 7 | 13 | 4 | 14 | 4 | 14 |
+
+Using the same method, we find the final 4 registers with the RB indices:
+
+    SVSHAPE1: | 0xd050d050c040c04 | 0xf070f070e060e06 | 0xc060c060f050f05 | 0xe040e040d070d07 |
+
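+The packing can be sanity-checked with a few lines of C (not part of
+the original page): pack eight 8-bit indices per 64-bit word, least
+significant byte first, and print the resulting constants:
+
+    #include <stdio.h>
+    #include <stdint.h>
+
+    /* pack eight 8-bit indices into one 64-bit word, LSB first */
+    static uint64_t pack8(const int *idx) {
+        uint64_t v = 0;
+        for (int i = 0; i < 8; i++)
+            v |= (uint64_t)idx[i] << (8 * i);
+        return v;
+    }
+
+    int main(void) {
+        const int rt[32] = { 0, 8, 0, 8, 1, 9, 1, 9,   2, 10, 2, 10, 3, 11, 3, 11,
+                             0, 10, 0, 10, 1, 11, 1, 11,   2, 8, 2, 8, 3, 9, 3, 9 };
+        const int rb[32] = { 4, 12, 4, 12, 5, 13, 5, 13,   6, 14, 6, 14, 7, 15, 7, 15,
+                             5, 15, 5, 15, 6, 12, 6, 12,   7, 13, 7, 13, 4, 14, 4, 14 };
+        for (int i = 0; i < 4; i++)
+            printf("SVSHAPE0[%d] = 0x%016llx\n", i, (unsigned long long)pack8(&rt[8 * i]));
+        for (int i = 0; i < 4; i++)
+            printf("SVSHAPE1[%d] = 0x%016llx\n", i, (unsigned long long)pack8(&rb[8 * i]));
+        return 0;
+    }
+
+The printed values match the SVSHAPE0 and SVSHAPE1 constants shown
+above.
+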
+Now, we can construct the Vertical-First loop:
+
+    svindex 4, 0, 1, 3, 0, 1, 0  # SVSHAPE0, add RA/RT indices
+    svindex 6, 1, 1, 3, 0, 1, 0  # SVSHAPE1, add RB indices
+    setvl 0, 0, 32, 1, 1, 1      # MAXVL=VL=32, VF=1
+    svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
+    sv.add/w=32 *x, *x, *x       # RT, RB: SHAPE0. RA: SHAPE1
+    svstep. 16, 1, 0             # step to next in-regs element
+
+What this code snippet does is the following:
+
+The first instruction
+
+    svindex 4, 0, 1, 3, 0, 1, 0
+
+sets up SVSHAPE0 with the add RT indices, held starting at register 8.
+You will note that 4 is listed, but that's because svindex addresses
+even-numbered registers only; to save a bit in the encoding the register
+number is halved, so we double the listed value to get the actual
+register. So, SVSHAPE0's indices will be held in GPRs 8-11 (32 8-bit
+indices in 4 GPRs). The number 3 indicates that the elements are 8 bits
+long: 0=64-bit, 1=32-bit, 2=16-bit, 3=8-bit.
+
+The next instruction
+
+    svindex 6, 1, 1, 3, 0, 1, 0
+
+loads the add RB indices into SVSHAPE1. Again, even though we list 6,
+the actual indices are held starting at GPR #12 (6*2), and again the
+use of 8-bit elements is denoted.
+
+Next, the setvl instruction:
+
+    setvl 0, 0, 32, 1, 1, 1
+
+We have to call setvl to set MAXVL and VL to 32 and also to configure
+Vertical-First mode. Afterwards, we have to specify the way we intend
+to use the indices, and we do this using svremap:
+
+    svremap 31, 1, 0, 0, 0, 0, 0
+
+svremap basically instructs the scheduler to use SVSHAPE0 for RT and RB,
+and SVSHAPE1 for RA. The next instruction performs the *actual* addition:
+
+    sv.add/w=32 *x, *x, *x
+
+Note the /w=32 suffix. This instructs the adder to perform the operation
+on elements of w=32 bits. Since the Power CPU is a 64-bit CPU, this means
+that two 32-bit elements are held in each register. Also, note that in
+all parameters we use *x as the argument. This instructs the scheduler
+to act on the registers as a vector, i.e. a sequence of elements. But
+even though all three arguments are the same, their indices will be
+taken from the SVSHAPE0/SVSHAPE1 indices as defined previously. Also
+note that the indices are relative to the actual register used. So,
+if *x starts at GPR 24 for example, in essence this instruction will
+issue the following sequence of instructions:
+
+    add/w=32 24 + 0, 24 + 4, 24 + 0
+    add/w=32 24 + 8, 24 + 12, 24 + 8
+    add/w=32 24 + 0, 24 + 4, 24 + 0
+    add/w=32 24 + 8, 24 + 12, 24 + 8
+    add/w=32 24 + 1, 24 + 5, 24 + 1
+    add/w=32 24 + 9, 24 + 13, 24 + 9
+    add/w=32 24 + 1, 24 + 5, 24 + 1
+    add/w=32 24 + 9, 24 + 13, 24 + 9
+    ...
+
+Finally, the svstep. instruction steps to the next set of indices.
+
+We have shown how to do the additions in Vertical-First mode. Now
+let's add the rest of the instructions in the QUARTERROUNDs. The XOR
+instructions of both QUARTERROUND groups (d = XOR(d, a) and
+b = XOR(b, c)), listed on their own, are:
+
+    x12 = x12 ^ x0
+    x4 = x4 ^ x8
+    x12 = x12 ^ x0
+    x4 = x4 ^ x8
+    x13 = x13 ^ x1
+    x5 = x5 ^ x9
+    x13 = x13 ^ x1
+    x5 = x5 ^ x9
+    x14 = x14 ^ x2
+    x6 = x6 ^ x10
+    x14 = x14 ^ x2
+    x6 = x6 ^ x10
+    x15 = x15 ^ x3
+    x7 = x7 ^ x11
+    x15 = x15 ^ x3
+    x7 = x7 ^ x11
+    x15 = x15 ^ x0
+    x5 = x5 ^ x10
+    x15 = x15 ^ x0
+    x5 = x5 ^ x10
+    x12 = x12 ^ x1
+    x6 = x6 ^ x11
+    x12 = x12 ^ x1
+    x6 = x6 ^ x11
+    x13 = x13 ^ x2
+    x7 = x7 ^ x8
+    x13 = x13 ^ x2
+    x7 = x7 ^ x8
+    x14 = x14 ^ x3
+    x4 = x4 ^ x9
+    x14 = x14 ^ x3
+    x4 = x4 ^ x9
+
+We will need to create another set of indices for the XOR instructions.
+We only need one new set, as the other operand's indices are the same
+as the RT indices for sv.add (SHAPE0). So, the indices needed for the
+XOR destinations are:
+
+    | 12 | 4 | 12 | 4 | 13 | 5 | 13 | 5 |
+    | 14 | 6 | 14 | 6 | 15 | 7 | 15 | 7 |
+    | 15 | 5 | 15 | 5 | 12 | 6 | 12 | 6 |
+    | 13 | 7 | 13 | 7 | 14 | 4 | 14 | 4 |
+
+Again, we find:
+
+    SVSHAPE2: | 0x50d050d040c040c | 0x70f070f060e060e | 0x60c060c050f050f | 0x40e040e070d070d |
+
+The next operation is the ROTATE, which takes as operands the result of
+the XOR and a shift amount. You can easily see that the indices used in
+this case are the same as for the XOR. However, the shift values cycle
+every 4: 16, 12, 8, 7. For the indices we can again use svindex, like
+this:
+
+    svindex 8, 2, 1, 3, 0, 1, 0
+
+This sets up SVSHAPE2, again with 8-bit elements, starting from
+GPR #16 (8*2). For the shift values cycling every 4 elements, the
+svshape2 instruction will be used:
+
+    svshape2 0, 0, 3, 4, 0, 1
+
+This will create SVSHAPE3, which uses a modulo-4 index for all of its
+elements. Now we can list both the XOR and ROTATE instructions in
+assembly, together with the respective svremap instructions:
+
+    svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
+    sv.xor/w=32 *x, *x, *x
+    svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
+    sv.rldcl/w=32 *x, *x, *SHIFTS, 0
+
+So, in a similar fashion, we instruct the XOR (sv.xor) to use SVSHAPE2
+for RA and RS and SVSHAPE0 for RB, again on 32-bit elements, while the
+ROTATE (sv.rldcl) will also use SVSHAPE2 for RA and RS, but SVSHAPE3
+for RB (the shift values, which cycle every 4 elements). Note that the
+actual indices for SVSHAPE3 have to be in 32-bit elements:
+
+    SHIFTS: | 0x0000000c00000010 | 0x0000000700000008 |
+
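+To see how the four SVSHAPEs cooperate, here is a scalar C model (not
+part of the original page) of what the 32 vertical-first steps compute.
+The arrays A, B and D (names chosen here for illustration) hold the
+SHAPE0, SHAPE1 and SHAPE2 index tables, and the SH table plays the role
+of SVSHAPE3's modulo-4 cycle:
+
+    #include <stdio.h>
+    #include <stdint.h>
+
+    static uint32_t rotl32(uint32_t v, int n) {
+        return (v << n) | (v >> (32 - n));
+    }
+
+    /* SHAPE0 (a), SHAPE1 (b) and SHAPE2 (d) element indices, as derived above */
+    static const int A[32] = { 0, 8, 0, 8, 1, 9, 1, 9,   2, 10, 2, 10, 3, 11, 3, 11,
+                               0, 10, 0, 10, 1, 11, 1, 11,   2, 8, 2, 8, 3, 9, 3, 9 };
+    static const int B[32] = { 4, 12, 4, 12, 5, 13, 5, 13,   6, 14, 6, 14, 7, 15, 7, 15,
+                               5, 15, 5, 15, 6, 12, 6, 12,   7, 13, 7, 13, 4, 14, 4, 14 };
+    static const int D[32] = { 12, 4, 12, 4, 13, 5, 13, 5,   14, 6, 14, 6, 15, 7, 15, 7,
+                               15, 5, 15, 5, 12, 6, 12, 6,   13, 7, 13, 7, 14, 4, 14, 4 };
+    static const int SH[4] = { 16, 12, 8, 7 };   /* shift values, cycling every 4 */
+
+    /* one pass over the 32 steps: equivalent to the 8 QUARTERROUND calls */
+    static void double_round(uint32_t x[16]) {
+        for (int i = 0; i < 32; i++) {
+            x[A[i]] = x[A[i]] + x[B[i]];            /* sv.add   */
+            x[D[i]] = x[D[i]] ^ x[A[i]];            /* sv.xor   */
+            x[D[i]] = rotl32(x[D[i]], SH[i % 4]);   /* sv.rldcl */
+        }
+    }
+
+    int main(void) {
+        uint32_t x[16];
+        for (int i = 0; i < 16; i++)
+            x[i] = i * 0x01010101u;                 /* arbitrary test state */
+        for (int r = 0; r < 10; r++)                /* ten double rounds */
+            double_round(x);
+        for (int i = 0; i < 16; i++)
+            printf("x%-2d = 0x%08x\n", i, (unsigned)x[i]);
+        return 0;
+    }
+
+Ten passes over the 32 steps correspond to the i < 10 loop of
+xchacha_hchacha20 shown at the top of the page, so the x0-x15 state
+produced by the SVP64 loop below can be checked against this model.
+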
+The complete algorithm for a loop with 10 iterations is as follows:
+
+    li 7, 10                       # Load value 10 into GPR #7
+    mtctr 7                        # Set up counter from GPR #7
+
+    # set up VL=32 vertical-first, and SVSHAPEs 0-2
+    setvl 0, 0, 32, 1, 1, 1
+    # SHAPE0, used by sv.add and sv.xor, starts at GPR #8
+    svindex 8/2, 0, 1, 3, 0, 1, 0  # SVSHAPE0, a
+    # SHAPE1, used by sv.add (RB), starts at GPR #12
+    svindex 12/2, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b
+    # SHAPE2, used by sv.xor and sv.rldcl, starts at GPR #16
+    svindex 16/2, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c
+    # SHAPE3, also used by sv.rldcl for the shift values (SHIFTS, at GPR #20)
+    # The inner loop will do 32 iterations, but there are only
+    # 4 shift values, so we mod by 4, and can cycle through them
+    svshape2 0, 0, 3, 4, 0, 1      # SVSHAPE3, shift amount, mod4
+
+.outer:
+    # outer loop begins here (standard CTR loop)
+    setvl 0, 0, 32, 1, 1, 1        # MAXVL=VL=32, VF=1
+    # inner loop begins here. add-xor-rotl32 with remap, step, branch
+.inner:
+    svremap 31, 1, 0, 0, 0, 0, 0   # RA=1, RB=0, RT=0 (0b01011)
+    sv.add/w=32 *x, *x, *x
+    svremap 31, 2, 0, 2, 2, 0, 0   # RA=2, RB=0, RS=2 (0b00111)
+    sv.xor/w=32 *x, *x, *x
+    svremap 31, 0, 3, 2, 2, 0, 0   # RA=2, RB=3, RS=2 (0b01110)
+    sv.rldcl/w=32 *x, *x, *SHIFTS, 0
+    # note: GPR #16 is not used as the svstep. destination here, as it
+    # overlaps with SHAPE2 (also at GPR #16) and the first 8 indices
+    # would get corrupted
+    svstep. 7, 1, 0                # step to next in-regs element
+    bc 6, 3, .inner                # svstep. Rc=1 loop-end condition
+    # inner-loop done: outer loop standard CTR-decrement to setvl again
+    bdnz .outer                    # Loop until CTR is zero
-- 
2.30.2